diff --git "a/vssd_base_e300_with_mesa.txt" "b/vssd_base_e300_with_mesa.txt" new file mode 100644--- /dev/null +++ "b/vssd_base_e300_with_mesa.txt" @@ -0,0 +1,17881 @@ +[2024-07-24 23:03:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-24 23:03:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-24 23:04:23 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-24 23:04:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/e89.pth +[2024-07-24 23:04:33 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/e89.pth.................... +[2024-07-24 23:04:33 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-24 23:04:33 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-24 23:04:33 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/e89.pth' (epoch 89) +[2024-07-24 23:04:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-24 23:04:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][0/625] eta 2:33:01 lr 0.001026 wd 0.0500 time 14.6903 (14.6903) data time 7.8856 (7.8856) model time 0.0000 (0.0000) loss 11.0569 (11.0569) grad_norm 2.6674 (2.6674) loss_scale 16384.0000 (16384.0000) mem 26016MB +[2024-07-24 23:05:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][10/625] eta 0:19:59 lr 0.001026 wd 0.0500 time 0.5681 (1.9499) data time 0.0008 (0.7176) model time 0.0000 (0.0000) loss 7.9309 (9.3633) grad_norm 3.6014 (inf) loss_scale 8192.0000 (11915.6364) mem 22341MB +[2024-07-24 23:05:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][20/625] eta 0:13:02 lr 0.001026 wd 0.0500 time 0.5698 (1.2937) data time 0.0008 (0.3763) model time 0.0000 (0.0000) loss 9.1351 (9.1801) grad_norm 2.7483 (inf) loss_scale 8192.0000 (10142.4762) mem 22341MB +[2024-07-24 23:05:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][30/625] eta 0:10:30 lr 0.001026 wd 0.0500 time 0.5689 (1.0603) data time 0.0007 (0.2552) model time 0.0000 (0.0000) loss 8.2225 (9.2204) grad_norm 4.1673 (inf) loss_scale 8192.0000 (9513.2903) mem 22341MB +[2024-07-24 23:05:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][40/625] eta 0:09:10 lr 0.001026 wd 0.0500 time 0.5716 (0.9408) data time 0.0008 (0.1931) model time 0.0000 (0.0000) loss 9.4737 (9.1374) grad_norm 1.8847 (inf) loss_scale 8192.0000 (9191.0244) mem 22341MB +[2024-07-24 23:05:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][50/625] eta 0:08:21 lr 0.001026 wd 0.0500 time 0.7798 (0.8727) data time 0.0006 (0.1554) model time 0.0000 (0.0000) loss 10.5695 (9.1051) grad_norm 3.4288 (inf) loss_scale 8192.0000 (8995.1373) mem 22341MB +[2024-07-24 23:05:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][60/625] eta 0:07:46 lr 0.001026 wd 0.0500 time 0.5705 (0.8261) data time 0.0009 (0.1301) model time 0.5696 (0.5876) loss 9.5715 (9.0538) grad_norm 3.8678 (inf) loss_scale 8192.0000 (8863.4754) mem 22341MB +[2024-07-24 23:05:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][70/625] eta 0:07:19 lr 0.001025 wd 0.0500 time 0.5895 (0.7911) data time 0.0008 (0.1119) model time 0.5887 (0.5820) loss 9.0344 (9.0198) grad_norm 2.3028 (inf) loss_scale 8192.0000 (8768.9014) mem 22341MB +[2024-07-24 23:05:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][80/625] eta 0:06:56 lr 0.001025 wd 0.0500 time 0.5802 (0.7650) data time 0.0008 (0.0982) model time 0.5794 (0.5811) loss 7.8647 (9.0107) grad_norm 2.7489 (inf) loss_scale 8192.0000 (8697.6790) mem 22341MB +[2024-07-24 23:05:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][90/625] eta 0:06:38 lr 0.001025 wd 0.0500 time 0.5808 (0.7445) data time 0.0006 (0.0875) model time 0.5802 (0.5802) loss 10.5755 (9.0117) grad_norm 2.4383 (inf) loss_scale 8192.0000 (8642.1099) mem 22341MB +[2024-07-24 23:05:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][100/625] eta 0:06:22 lr 0.001025 wd 0.0500 time 0.5731 (0.7279) data time 0.0006 (0.0789) model time 0.5726 (0.5793) loss 8.7716 (9.0399) grad_norm 2.8939 (inf) loss_scale 8192.0000 (8597.5446) mem 22341MB +[2024-07-24 23:05:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][110/625] eta 0:06:07 lr 0.001025 wd 0.0500 time 0.5694 (0.7141) data time 0.0009 (0.0719) model time 0.5685 (0.5785) loss 8.0070 (9.0185) grad_norm 2.7348 (inf) loss_scale 8192.0000 (8561.0090) mem 22341MB +[2024-07-24 23:06:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][120/625] eta 0:05:54 lr 0.001025 wd 0.0500 time 0.5775 (0.7026) data time 0.0006 (0.0660) model time 0.5769 (0.5779) loss 7.5443 (9.0215) grad_norm 2.5170 (inf) loss_scale 8192.0000 (8530.5124) mem 22341MB +[2024-07-24 23:06:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][130/625] eta 0:05:42 lr 0.001025 wd 0.0500 time 0.5722 (0.6929) data time 0.0009 (0.0610) model time 0.5713 (0.5775) loss 9.0003 (9.0145) grad_norm 2.8097 (inf) loss_scale 8192.0000 (8504.6718) mem 22341MB +[2024-07-24 23:06:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][140/625] eta 0:05:32 lr 0.001025 wd 0.0500 time 0.5808 (0.6847) data time 0.0006 (0.0567) model time 0.5801 (0.5773) loss 8.3137 (9.0033) grad_norm 2.3818 (inf) loss_scale 8192.0000 (8482.4965) mem 22341MB +[2024-07-24 23:06:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][150/625] eta 0:05:21 lr 0.001025 wd 0.0500 time 0.5832 (0.6776) data time 0.0010 (0.0531) model time 0.5822 (0.5772) loss 7.8585 (8.9951) grad_norm 2.8085 (inf) loss_scale 8192.0000 (8463.2583) mem 22341MB +[2024-07-24 23:06:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][160/625] eta 0:05:12 lr 0.001025 wd 0.0500 time 0.5770 (0.6715) data time 0.0008 (0.0498) model time 0.5761 (0.5773) loss 10.0729 (9.0106) grad_norm 3.4342 (inf) loss_scale 8192.0000 (8446.4099) mem 22341MB +[2024-07-24 23:06:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][170/625] eta 0:05:03 lr 0.001025 wd 0.0500 time 0.5731 (0.6660) data time 0.0008 (0.0470) model time 0.5723 (0.5773) loss 8.6161 (9.0139) grad_norm 2.1672 (inf) loss_scale 8192.0000 (8431.5322) mem 22341MB +[2024-07-24 23:06:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][180/625] eta 0:04:54 lr 0.001025 wd 0.0500 time 0.5810 (0.6611) data time 0.0008 (0.0444) model time 0.5803 (0.5772) loss 10.1739 (8.9906) grad_norm 3.2241 (inf) loss_scale 8192.0000 (8418.2983) mem 22341MB +[2024-07-24 23:06:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][190/625] eta 0:04:45 lr 0.001025 wd 0.0500 time 0.5698 (0.6565) data time 0.0008 (0.0421) model time 0.5690 (0.5769) loss 8.4136 (8.9740) grad_norm 2.2587 (inf) loss_scale 8192.0000 (8406.4503) mem 22341MB +[2024-07-24 23:06:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][200/625] eta 0:04:37 lr 0.001025 wd 0.0500 time 0.5736 (0.6526) data time 0.0008 (0.0401) model time 0.5727 (0.5769) loss 7.9437 (8.9433) grad_norm 2.3782 (inf) loss_scale 8192.0000 (8395.7811) mem 22341MB +[2024-07-24 23:06:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][210/625] eta 0:04:29 lr 0.001024 wd 0.0500 time 0.5752 (0.6490) data time 0.0008 (0.0382) model time 0.5745 (0.5769) loss 9.5249 (8.9300) grad_norm 2.0150 (inf) loss_scale 8192.0000 (8386.1232) mem 22341MB +[2024-07-24 23:07:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][220/625] eta 0:04:21 lr 0.001024 wd 0.0500 time 0.5785 (0.6459) data time 0.0008 (0.0365) model time 0.5777 (0.5769) loss 8.4985 (8.9088) grad_norm 2.8784 (inf) loss_scale 8192.0000 (8377.3394) mem 22341MB +[2024-07-24 23:07:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][230/625] eta 0:04:13 lr 0.001024 wd 0.0500 time 0.5779 (0.6430) data time 0.0007 (0.0350) model time 0.5772 (0.5770) loss 6.8280 (8.9033) grad_norm 3.2492 (inf) loss_scale 8192.0000 (8369.3160) mem 22341MB +[2024-07-24 23:07:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][240/625] eta 0:04:06 lr 0.001024 wd 0.0500 time 0.5797 (0.6403) data time 0.0006 (0.0336) model time 0.5791 (0.5771) loss 8.2084 (8.8956) grad_norm 2.9839 (inf) loss_scale 8192.0000 (8361.9585) mem 22341MB +[2024-07-24 23:07:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][250/625] eta 0:03:59 lr 0.001024 wd 0.0500 time 0.5758 (0.6377) data time 0.0006 (0.0323) model time 0.5752 (0.5770) loss 8.9934 (8.8876) grad_norm 3.4557 (inf) loss_scale 8192.0000 (8355.1873) mem 22341MB +[2024-07-24 23:07:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][260/625] eta 0:03:51 lr 0.001024 wd 0.0500 time 0.5739 (0.6353) data time 0.0006 (0.0311) model time 0.5733 (0.5769) loss 8.7796 (8.8642) grad_norm 2.2002 (inf) loss_scale 8192.0000 (8348.9349) mem 22341MB +[2024-07-24 23:07:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][270/625] eta 0:03:44 lr 0.001024 wd 0.0500 time 0.5729 (0.6332) data time 0.0006 (0.0299) model time 0.5723 (0.5769) loss 9.2037 (8.8589) grad_norm 2.7557 (inf) loss_scale 8192.0000 (8343.1439) mem 22341MB +[2024-07-24 23:07:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][280/625] eta 0:03:38 lr 0.001024 wd 0.0500 time 0.5784 (0.6329) data time 0.0008 (0.0289) model time 0.5777 (0.5789) loss 9.0132 (8.8724) grad_norm 2.4230 (inf) loss_scale 8192.0000 (8337.7651) mem 22341MB +[2024-07-24 23:07:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][290/625] eta 0:03:31 lr 0.001024 wd 0.0500 time 0.5772 (0.6310) data time 0.0006 (0.0279) model time 0.5765 (0.5788) loss 7.0565 (8.8576) grad_norm 2.3026 (inf) loss_scale 8192.0000 (8332.7560) mem 22341MB +[2024-07-24 23:07:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][300/625] eta 0:03:24 lr 0.001024 wd 0.0500 time 0.5721 (0.6293) data time 0.0008 (0.0270) model time 0.5713 (0.5787) loss 7.2341 (8.8354) grad_norm 1.9477 (inf) loss_scale 8192.0000 (8328.0797) mem 22341MB +[2024-07-24 23:07:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][310/625] eta 0:03:17 lr 0.001024 wd 0.0500 time 0.5782 (0.6276) data time 0.0008 (0.0262) model time 0.5774 (0.5787) loss 9.5312 (8.8361) grad_norm 2.1166 (inf) loss_scale 8192.0000 (8323.7042) mem 22341MB +[2024-07-24 23:07:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][320/625] eta 0:03:10 lr 0.001024 wd 0.0500 time 0.5767 (0.6260) data time 0.0006 (0.0254) model time 0.5762 (0.5786) loss 10.5362 (8.8584) grad_norm 2.8076 (inf) loss_scale 8192.0000 (8319.6012) mem 22341MB +[2024-07-24 23:08:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][330/625] eta 0:03:04 lr 0.001024 wd 0.0500 time 0.5758 (0.6245) data time 0.0006 (0.0247) model time 0.5752 (0.5785) loss 7.3990 (8.8516) grad_norm 2.0770 (inf) loss_scale 8192.0000 (8315.7462) mem 22341MB +[2024-07-24 23:08:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][340/625] eta 0:02:57 lr 0.001023 wd 0.0500 time 0.5739 (0.6231) data time 0.0008 (0.0240) model time 0.5731 (0.5784) loss 9.4042 (8.8553) grad_norm 2.3137 (inf) loss_scale 8192.0000 (8312.1173) mem 22341MB +[2024-07-24 23:08:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][350/625] eta 0:02:50 lr 0.001023 wd 0.0500 time 0.5732 (0.6217) data time 0.0007 (0.0233) model time 0.5725 (0.5782) loss 7.6326 (8.8562) grad_norm 4.0680 (inf) loss_scale 8192.0000 (8308.6952) mem 22341MB +[2024-07-24 23:08:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][360/625] eta 0:02:44 lr 0.001023 wd 0.0500 time 0.5717 (0.6205) data time 0.0007 (0.0227) model time 0.5710 (0.5782) loss 8.3663 (8.8404) grad_norm 2.5738 (inf) loss_scale 8192.0000 (8305.4626) mem 22341MB +[2024-07-24 23:08:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][370/625] eta 0:02:37 lr 0.001023 wd 0.0500 time 0.5783 (0.6194) data time 0.0009 (0.0221) model time 0.5774 (0.5782) loss 8.4384 (8.8395) grad_norm 2.6509 (inf) loss_scale 8192.0000 (8302.4043) mem 22341MB +[2024-07-24 23:08:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][380/625] eta 0:02:31 lr 0.001023 wd 0.0500 time 0.5794 (0.6184) data time 0.0007 (0.0215) model time 0.5787 (0.5782) loss 6.2908 (8.8304) grad_norm 1.9291 (inf) loss_scale 8192.0000 (8299.5066) mem 22341MB +[2024-07-24 23:08:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][390/625] eta 0:02:25 lr 0.001023 wd 0.0500 time 0.5775 (0.6173) data time 0.0007 (0.0210) model time 0.5769 (0.5782) loss 10.4164 (8.8208) grad_norm 1.7616 (inf) loss_scale 8192.0000 (8296.7570) mem 22341MB +[2024-07-24 23:08:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][400/625] eta 0:02:18 lr 0.001023 wd 0.0500 time 0.5735 (0.6162) data time 0.0008 (0.0205) model time 0.5726 (0.5780) loss 9.3537 (8.8209) grad_norm 2.2944 (inf) loss_scale 8192.0000 (8294.1446) mem 22341MB +[2024-07-24 23:08:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][410/625] eta 0:02:12 lr 0.001023 wd 0.0500 time 0.5788 (0.6153) data time 0.0009 (0.0200) model time 0.5780 (0.5780) loss 9.1568 (8.8222) grad_norm 1.6582 (inf) loss_scale 8192.0000 (8291.6594) mem 22341MB +[2024-07-24 23:08:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][420/625] eta 0:02:05 lr 0.001023 wd 0.0500 time 0.5742 (0.6144) data time 0.0006 (0.0196) model time 0.5736 (0.5779) loss 11.0168 (8.8222) grad_norm 3.7077 (inf) loss_scale 8192.0000 (8289.2922) mem 22341MB +[2024-07-24 23:09:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][430/625] eta 0:01:59 lr 0.001023 wd 0.0500 time 0.5748 (0.6135) data time 0.0006 (0.0191) model time 0.5742 (0.5779) loss 8.9909 (8.8259) grad_norm 2.5506 (inf) loss_scale 8192.0000 (8287.0348) mem 22341MB +[2024-07-24 23:09:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][440/625] eta 0:01:53 lr 0.001023 wd 0.0500 time 0.5776 (0.6127) data time 0.0007 (0.0187) model time 0.5769 (0.5779) loss 10.0817 (8.8293) grad_norm 1.7901 (inf) loss_scale 8192.0000 (8284.8798) mem 22341MB +[2024-07-24 23:09:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][450/625] eta 0:01:47 lr 0.001023 wd 0.0500 time 0.5801 (0.6119) data time 0.0007 (0.0183) model time 0.5794 (0.5779) loss 7.9057 (8.8226) grad_norm 2.1102 (inf) loss_scale 8192.0000 (8282.8204) mem 22341MB +[2024-07-24 23:09:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][460/625] eta 0:01:40 lr 0.001023 wd 0.0500 time 0.5744 (0.6112) data time 0.0008 (0.0179) model time 0.5736 (0.5779) loss 9.9784 (8.8068) grad_norm 2.2618 (inf) loss_scale 8192.0000 (8280.8503) mem 22341MB +[2024-07-24 23:09:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][470/625] eta 0:01:34 lr 0.001022 wd 0.0500 time 0.5757 (0.6105) data time 0.0008 (0.0176) model time 0.5748 (0.5778) loss 8.4481 (8.7890) grad_norm 1.8125 (inf) loss_scale 8192.0000 (8278.9639) mem 22341MB +[2024-07-24 23:09:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][480/625] eta 0:01:28 lr 0.001022 wd 0.0500 time 0.5759 (0.6098) data time 0.0007 (0.0172) model time 0.5751 (0.5777) loss 8.0049 (8.7811) grad_norm 2.0818 (inf) loss_scale 8192.0000 (8277.1559) mem 22341MB +[2024-07-24 23:09:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][490/625] eta 0:01:22 lr 0.001022 wd 0.0500 time 0.5204 (0.6094) data time 0.0009 (0.0169) model time 0.5195 (0.5781) loss 8.7924 (8.7800) grad_norm 2.3945 (inf) loss_scale 8192.0000 (8275.4216) mem 22341MB +[2024-07-24 23:09:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][500/625] eta 0:01:16 lr 0.001022 wd 0.0500 time 0.5841 (0.6098) data time 0.0008 (0.0166) model time 0.5833 (0.5791) loss 9.1620 (8.7755) grad_norm 2.2217 (inf) loss_scale 8192.0000 (8273.7565) mem 22341MB +[2024-07-24 23:09:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][510/625] eta 0:01:10 lr 0.001022 wd 0.0500 time 0.5754 (0.6091) data time 0.0006 (0.0163) model time 0.5748 (0.5791) loss 9.2269 (8.7819) grad_norm 2.8494 (inf) loss_scale 8192.0000 (8272.1566) mem 22341MB +[2024-07-24 23:09:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][520/625] eta 0:01:03 lr 0.001022 wd 0.0500 time 0.5756 (0.6085) data time 0.0006 (0.0160) model time 0.5750 (0.5790) loss 7.8728 (8.7795) grad_norm 3.5692 (inf) loss_scale 8192.0000 (8270.6180) mem 22341MB +[2024-07-24 23:10:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][530/625] eta 0:00:57 lr 0.001022 wd 0.0500 time 0.5776 (0.6079) data time 0.0008 (0.0157) model time 0.5768 (0.5790) loss 9.1347 (8.7751) grad_norm 1.7865 (inf) loss_scale 8192.0000 (8269.1375) mem 22341MB +[2024-07-24 23:10:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][540/625] eta 0:00:51 lr 0.001022 wd 0.0500 time 0.5761 (0.6073) data time 0.0008 (0.0154) model time 0.5753 (0.5789) loss 8.6561 (8.7734) grad_norm 3.8178 (inf) loss_scale 8192.0000 (8267.7116) mem 22341MB +[2024-07-24 23:10:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][550/625] eta 0:00:45 lr 0.001022 wd 0.0500 time 0.5755 (0.6067) data time 0.0006 (0.0151) model time 0.5749 (0.5788) loss 9.3440 (8.7711) grad_norm 2.0231 (inf) loss_scale 8192.0000 (8266.3376) mem 22341MB +[2024-07-24 23:10:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][560/625] eta 0:00:39 lr 0.001022 wd 0.0500 time 0.5725 (0.6062) data time 0.0008 (0.0149) model time 0.5718 (0.5787) loss 9.5030 (8.7745) grad_norm 2.0091 (inf) loss_scale 8192.0000 (8265.0125) mem 22341MB +[2024-07-24 23:10:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][570/625] eta 0:00:33 lr 0.001022 wd 0.0500 time 0.5649 (0.6056) data time 0.0009 (0.0146) model time 0.5641 (0.5786) loss 9.3059 (8.7790) grad_norm 2.6207 (inf) loss_scale 4096.0000 (8235.0403) mem 22341MB +[2024-07-24 23:10:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][580/625] eta 0:00:27 lr 0.001022 wd 0.0500 time 0.5769 (0.6051) data time 0.0007 (0.0144) model time 0.5762 (0.5785) loss 9.2017 (8.7742) grad_norm 2.0360 (inf) loss_scale 4096.0000 (8163.8003) mem 22341MB +[2024-07-24 23:10:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][590/625] eta 0:00:21 lr 0.001022 wd 0.0500 time 0.5733 (0.6046) data time 0.0008 (0.0142) model time 0.5726 (0.5785) loss 9.2003 (8.7741) grad_norm 1.8565 (inf) loss_scale 4096.0000 (8094.9712) mem 22341MB +[2024-07-24 23:10:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][600/625] eta 0:00:15 lr 0.001021 wd 0.0500 time 0.5764 (0.6042) data time 0.0008 (0.0139) model time 0.5757 (0.5785) loss 6.8561 (8.7705) grad_norm 2.5663 (inf) loss_scale 4096.0000 (8028.4326) mem 22341MB +[2024-07-24 23:10:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][610/625] eta 0:00:09 lr 0.001021 wd 0.0500 time 0.5722 (0.6038) data time 0.0006 (0.0137) model time 0.5716 (0.5784) loss 8.7312 (8.7670) grad_norm 2.1149 (inf) loss_scale 4096.0000 (7964.0720) mem 22341MB +[2024-07-24 23:10:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [90/300][620/625] eta 0:00:03 lr 0.001021 wd 0.0500 time 0.5710 (0.6033) data time 0.0004 (0.0135) model time 0.5706 (0.5783) loss 9.1690 (8.7696) grad_norm 2.1037 (inf) loss_scale 4096.0000 (7901.7842) mem 22341MB +[2024-07-24 23:10:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 90 training takes 0:06:16 +[2024-07-24 23:10:55 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-24 23:10:56 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-24 23:10:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.568 (0.568) Loss 0.5640 (0.5640) Acc@1 88.135 (88.135) Acc@5 98.242 (98.242) Mem 22341MB +[2024-07-24 23:10:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.166) Loss 0.9751 (0.7205) Acc@1 77.393 (84.202) Acc@5 94.629 (97.048) Mem 22341MB +[2024-07-24 23:10:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.147) Loss 1.0723 (0.8573) Acc@1 74.707 (80.838) Acc@5 93.506 (95.499) Mem 22341MB +[2024-07-24 23:11:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 80.550 Acc@5 95.493 +[2024-07-24 23:11:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 80.6% +[2024-07-24 23:11:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.926 (0.926) Loss 0.5581 (0.5581) Acc@1 89.014 (89.014) Acc@5 98.486 (98.486) Mem 22341MB +[2024-07-24 23:11:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.124 (0.198) Loss 0.8745 (0.6891) Acc@1 79.883 (85.684) Acc@5 95.850 (97.550) Mem 22341MB +[2024-07-24 23:11:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.163) Loss 0.9800 (0.8024) Acc@1 76.172 (82.350) Acc@5 94.873 (96.310) Mem 22341MB +[2024-07-24 23:11:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.078 Acc@5 96.341 +[2024-07-24 23:11:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.1% +[2024-07-24 23:11:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.08% +[2024-07-24 23:11:06 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-24 23:11:08 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-24 23:11:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][0/625] eta 0:14:33 lr 0.001021 wd 0.0500 time 1.3979 (1.3979) data time 0.7563 (0.7563) model time 0.0000 (0.0000) loss 7.6764 (7.6764) grad_norm 2.0643 (2.0643) loss_scale 4096.0000 (4096.0000) mem 22337MB +[2024-07-24 23:11:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][10/625] eta 0:06:38 lr 0.001021 wd 0.0500 time 0.5804 (0.6487) data time 0.0006 (0.0695) model time 0.0000 (0.0000) loss 7.5686 (8.4513) grad_norm 1.7875 (2.3548) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:11:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][20/625] eta 0:06:12 lr 0.001021 wd 0.0500 time 0.5703 (0.6156) data time 0.0008 (0.0368) model time 0.0000 (0.0000) loss 8.3941 (8.2233) grad_norm 2.4059 (2.5735) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:11:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][30/625] eta 0:05:58 lr 0.001021 wd 0.0500 time 0.5706 (0.6025) data time 0.0009 (0.0252) model time 0.0000 (0.0000) loss 7.0706 (8.2709) grad_norm 2.2714 (2.4959) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:11:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][40/625] eta 0:05:48 lr 0.001021 wd 0.0500 time 0.5823 (0.5957) data time 0.0006 (0.0192) model time 0.0000 (0.0000) loss 8.2136 (8.2765) grad_norm 2.9643 (2.5145) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:11:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][50/625] eta 0:05:40 lr 0.001021 wd 0.0500 time 0.5807 (0.5917) data time 0.0006 (0.0156) model time 0.0000 (0.0000) loss 8.1722 (8.3323) grad_norm 2.5358 (2.4683) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:11:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][60/625] eta 0:05:32 lr 0.001021 wd 0.0500 time 0.5783 (0.5890) data time 0.0008 (0.0132) model time 0.5775 (0.5748) loss 8.1858 (8.3941) grad_norm 1.7192 (2.3950) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:11:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][70/625] eta 0:05:25 lr 0.001021 wd 0.0500 time 0.5741 (0.5870) data time 0.0006 (0.0114) model time 0.5735 (0.5742) loss 6.5427 (8.3818) grad_norm 1.9382 (2.3879) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:11:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][80/625] eta 0:05:18 lr 0.001021 wd 0.0500 time 0.5710 (0.5853) data time 0.0007 (0.0101) model time 0.5703 (0.5737) loss 9.1375 (8.3861) grad_norm 2.0615 (2.3418) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:12:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][90/625] eta 0:05:13 lr 0.001021 wd 0.0500 time 0.6837 (0.5865) data time 0.0009 (0.0091) model time 0.6828 (0.5791) loss 8.6604 (8.3516) grad_norm 1.7998 (2.2868) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:12:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][100/625] eta 0:05:09 lr 0.001021 wd 0.0500 time 0.5748 (0.5893) data time 0.0006 (0.0083) model time 0.5742 (0.5861) loss 8.8487 (8.3273) grad_norm 1.7373 (2.2604) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:12:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][110/625] eta 0:05:02 lr 0.001020 wd 0.0500 time 0.5755 (0.5879) data time 0.0006 (0.0076) model time 0.5749 (0.5839) loss 9.9101 (8.3952) grad_norm 2.7651 (2.2664) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:12:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][120/625] eta 0:04:56 lr 0.001020 wd 0.0500 time 0.5741 (0.5869) data time 0.0006 (0.0071) model time 0.5734 (0.5826) loss 9.1616 (8.4348) grad_norm 2.4770 (2.2746) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:12:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][130/625] eta 0:04:50 lr 0.001020 wd 0.0500 time 0.5711 (0.5859) data time 0.0006 (0.0066) model time 0.5705 (0.5813) loss 9.8316 (8.4405) grad_norm 2.5231 (2.2565) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:12:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][140/625] eta 0:04:43 lr 0.001020 wd 0.0500 time 0.5895 (0.5852) data time 0.0010 (0.0062) model time 0.5884 (0.5806) loss 9.4172 (8.4805) grad_norm 1.8421 (2.2611) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:12:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][150/625] eta 0:04:37 lr 0.001020 wd 0.0500 time 0.5730 (0.5844) data time 0.0009 (0.0058) model time 0.5721 (0.5799) loss 9.6588 (8.4933) grad_norm 2.4209 (2.2459) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:12:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][160/625] eta 0:04:31 lr 0.001020 wd 0.0500 time 0.5754 (0.5842) data time 0.0008 (0.0055) model time 0.5747 (0.5799) loss 8.4100 (8.4812) grad_norm 2.0049 (2.2314) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:12:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][170/625] eta 0:04:25 lr 0.001020 wd 0.0500 time 0.5728 (0.5836) data time 0.0006 (0.0052) model time 0.5722 (0.5793) loss 6.8602 (8.5063) grad_norm 2.4501 (2.2395) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:12:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][180/625] eta 0:04:19 lr 0.001020 wd 0.0500 time 0.5744 (0.5831) data time 0.0006 (0.0050) model time 0.5738 (0.5789) loss 7.8778 (8.5054) grad_norm 1.7001 (2.2388) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:12:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][190/625] eta 0:04:13 lr 0.001020 wd 0.0500 time 0.5585 (0.5828) data time 0.0007 (0.0048) model time 0.5578 (0.5787) loss 9.6358 (8.4930) grad_norm 2.9849 (2.2622) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:13:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][200/625] eta 0:04:07 lr 0.001020 wd 0.0500 time 0.5745 (0.5824) data time 0.0009 (0.0046) model time 0.5736 (0.5784) loss 6.8874 (8.4830) grad_norm 3.6286 (2.2730) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:13:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][210/625] eta 0:04:01 lr 0.001020 wd 0.0500 time 0.5741 (0.5821) data time 0.0006 (0.0044) model time 0.5735 (0.5782) loss 9.7137 (8.4850) grad_norm 2.4852 (2.2601) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:13:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][220/625] eta 0:03:55 lr 0.001020 wd 0.0500 time 0.5812 (0.5817) data time 0.0006 (0.0042) model time 0.5806 (0.5779) loss 8.7213 (8.4653) grad_norm 1.7914 (2.2538) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:13:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][230/625] eta 0:03:49 lr 0.001020 wd 0.0500 time 0.5727 (0.5814) data time 0.0008 (0.0041) model time 0.5719 (0.5777) loss 9.1781 (8.4746) grad_norm 2.2224 (2.2659) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:13:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][240/625] eta 0:03:44 lr 0.001019 wd 0.0500 time 0.7622 (0.5819) data time 0.0009 (0.0039) model time 0.7613 (0.5785) loss 10.5080 (8.4845) grad_norm 1.6230 (2.2619) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:13:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][250/625] eta 0:03:38 lr 0.001019 wd 0.0500 time 0.5796 (0.5815) data time 0.0008 (0.0038) model time 0.5788 (0.5781) loss 7.5145 (8.4778) grad_norm 2.9878 (2.2762) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:13:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][260/625] eta 0:03:32 lr 0.001019 wd 0.0500 time 0.5755 (0.5812) data time 0.0006 (0.0037) model time 0.5748 (0.5779) loss 9.2726 (8.4767) grad_norm 1.5876 (2.2815) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:13:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][270/625] eta 0:03:26 lr 0.001019 wd 0.0500 time 0.5770 (0.5810) data time 0.0008 (0.0036) model time 0.5763 (0.5778) loss 8.3831 (8.4612) grad_norm 2.2509 (2.2802) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:13:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][280/625] eta 0:03:20 lr 0.001019 wd 0.0500 time 0.5692 (0.5810) data time 0.0008 (0.0035) model time 0.5684 (0.5778) loss 9.6628 (8.4616) grad_norm 2.7261 (2.2896) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:13:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][290/625] eta 0:03:14 lr 0.001019 wd 0.0500 time 0.5742 (0.5807) data time 0.0006 (0.0034) model time 0.5736 (0.5776) loss 8.8575 (8.4538) grad_norm 1.5314 (2.2819) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:14:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][300/625] eta 0:03:08 lr 0.001019 wd 0.0500 time 0.5763 (0.5805) data time 0.0008 (0.0033) model time 0.5755 (0.5774) loss 8.0659 (8.4625) grad_norm 1.9316 (2.2745) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:14:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][310/625] eta 0:03:03 lr 0.001019 wd 0.0500 time 0.7411 (0.5812) data time 0.0007 (0.0032) model time 0.7404 (0.5783) loss 7.8121 (8.4634) grad_norm 1.8855 (2.2645) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:14:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][320/625] eta 0:02:57 lr 0.001019 wd 0.0500 time 0.5741 (0.5825) data time 0.0008 (0.0032) model time 0.5733 (0.5799) loss 9.0934 (8.4570) grad_norm 1.6349 (2.2609) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:14:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][330/625] eta 0:02:51 lr 0.001019 wd 0.0500 time 0.5757 (0.5823) data time 0.0006 (0.0031) model time 0.5752 (0.5798) loss 8.2089 (8.4442) grad_norm 2.7929 (2.2561) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:14:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][340/625] eta 0:02:45 lr 0.001019 wd 0.0500 time 0.5753 (0.5821) data time 0.0010 (0.0030) model time 0.5743 (0.5796) loss 9.6743 (8.4458) grad_norm 1.8279 (2.2495) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:14:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][350/625] eta 0:02:40 lr 0.001019 wd 0.0500 time 0.5866 (0.5819) data time 0.0008 (0.0030) model time 0.5858 (0.5794) loss 9.5005 (8.4648) grad_norm 2.1524 (2.2461) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:14:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][360/625] eta 0:02:34 lr 0.001019 wd 0.0500 time 0.5772 (0.5817) data time 0.0007 (0.0029) model time 0.5764 (0.5792) loss 7.2719 (8.4653) grad_norm 1.8828 (2.2397) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:14:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][370/625] eta 0:02:28 lr 0.001018 wd 0.0500 time 0.5731 (0.5815) data time 0.0009 (0.0028) model time 0.5723 (0.5790) loss 8.2686 (8.4619) grad_norm 2.1219 (2.2394) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:14:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][380/625] eta 0:02:22 lr 0.001018 wd 0.0500 time 0.5700 (0.5813) data time 0.0010 (0.0028) model time 0.5690 (0.5789) loss 9.2562 (8.4619) grad_norm 2.5471 (2.2476) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:14:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][390/625] eta 0:02:16 lr 0.001018 wd 0.0500 time 0.5774 (0.5811) data time 0.0006 (0.0027) model time 0.5767 (0.5787) loss 7.4775 (8.4664) grad_norm 2.3175 (2.2541) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:15:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][400/625] eta 0:02:10 lr 0.001018 wd 0.0500 time 0.5806 (0.5810) data time 0.0008 (0.0027) model time 0.5798 (0.5786) loss 8.4107 (8.4506) grad_norm 1.6019 (2.2501) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:15:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][410/625] eta 0:02:04 lr 0.001018 wd 0.0500 time 0.5800 (0.5809) data time 0.0006 (0.0026) model time 0.5794 (0.5785) loss 9.5405 (8.4463) grad_norm 2.0408 (2.2417) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:15:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][420/625] eta 0:01:59 lr 0.001018 wd 0.0500 time 0.5770 (0.5807) data time 0.0008 (0.0026) model time 0.5763 (0.5784) loss 8.4795 (8.4483) grad_norm 1.8458 (2.2313) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:15:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][430/625] eta 0:01:53 lr 0.001018 wd 0.0500 time 0.5739 (0.5806) data time 0.0008 (0.0026) model time 0.5731 (0.5783) loss 7.8530 (8.4457) grad_norm 1.5026 (2.2279) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:15:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][440/625] eta 0:01:47 lr 0.001018 wd 0.0500 time 0.5757 (0.5805) data time 0.0007 (0.0025) model time 0.5750 (0.5782) loss 7.0517 (8.4434) grad_norm 2.5135 (2.2245) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:15:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][450/625] eta 0:01:41 lr 0.001018 wd 0.0500 time 0.5748 (0.5804) data time 0.0008 (0.0025) model time 0.5741 (0.5781) loss 7.7430 (8.4333) grad_norm 2.1294 (2.2251) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:15:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][460/625] eta 0:01:35 lr 0.001018 wd 0.0500 time 0.5771 (0.5803) data time 0.0006 (0.0024) model time 0.5764 (0.5780) loss 7.2542 (8.4287) grad_norm 2.0012 (2.2195) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:15:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][470/625] eta 0:01:29 lr 0.001018 wd 0.0500 time 0.5748 (0.5803) data time 0.0007 (0.0024) model time 0.5742 (0.5781) loss 9.0709 (8.4246) grad_norm 1.6030 (2.2121) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:15:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][480/625] eta 0:01:24 lr 0.001018 wd 0.0500 time 0.5882 (0.5802) data time 0.0008 (0.0024) model time 0.5874 (0.5780) loss 8.2305 (8.4312) grad_norm 1.6483 (2.2071) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:15:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][490/625] eta 0:01:18 lr 0.001018 wd 0.0500 time 0.5721 (0.5801) data time 0.0009 (0.0024) model time 0.5712 (0.5780) loss 8.5598 (8.4354) grad_norm 1.9289 (2.2047) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:15:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][500/625] eta 0:01:12 lr 0.001017 wd 0.0500 time 0.5798 (0.5800) data time 0.0010 (0.0023) model time 0.5788 (0.5779) loss 9.0212 (8.4394) grad_norm 3.2648 (2.2041) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:16:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][510/625] eta 0:01:06 lr 0.001017 wd 0.0500 time 0.5734 (0.5800) data time 0.0008 (0.0023) model time 0.5726 (0.5778) loss 9.8891 (8.4347) grad_norm 3.1420 (2.2072) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:16:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][520/625] eta 0:01:00 lr 0.001017 wd 0.0500 time 0.5764 (0.5799) data time 0.0006 (0.0023) model time 0.5758 (0.5778) loss 8.3903 (8.4367) grad_norm 1.5783 (2.2090) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:16:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][530/625] eta 0:00:55 lr 0.001017 wd 0.0500 time 0.7016 (0.5800) data time 0.0006 (0.0022) model time 0.7010 (0.5780) loss 9.1298 (8.4421) grad_norm 2.4225 (2.2053) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:16:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][540/625] eta 0:00:49 lr 0.001017 wd 0.0500 time 0.5740 (0.5805) data time 0.0006 (0.0022) model time 0.5734 (0.5785) loss 10.1067 (8.4368) grad_norm 2.2326 (2.2025) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:16:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][550/625] eta 0:00:43 lr 0.001017 wd 0.0500 time 0.5732 (0.5804) data time 0.0008 (0.0022) model time 0.5724 (0.5785) loss 9.3183 (8.4432) grad_norm 1.6972 (2.2011) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:16:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][560/625] eta 0:00:37 lr 0.001017 wd 0.0500 time 0.5773 (0.5804) data time 0.0008 (0.0022) model time 0.5764 (0.5784) loss 9.6581 (8.4507) grad_norm 1.7796 (2.2002) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:16:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][570/625] eta 0:00:31 lr 0.001017 wd 0.0500 time 0.5766 (0.5803) data time 0.0008 (0.0022) model time 0.5758 (0.5783) loss 7.8488 (8.4460) grad_norm 1.6846 (2.2006) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:16:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][580/625] eta 0:00:26 lr 0.001017 wd 0.0500 time 0.5738 (0.5802) data time 0.0006 (0.0021) model time 0.5732 (0.5783) loss 7.7309 (8.4476) grad_norm 1.7434 (2.1982) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:16:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][590/625] eta 0:00:20 lr 0.001017 wd 0.0500 time 0.5712 (0.5801) data time 0.0006 (0.0021) model time 0.5706 (0.5782) loss 7.1181 (8.4457) grad_norm 1.9325 (2.1975) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:16:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][600/625] eta 0:00:14 lr 0.001017 wd 0.0500 time 0.5769 (0.5800) data time 0.0009 (0.0021) model time 0.5761 (0.5781) loss 7.0241 (8.4466) grad_norm 1.9734 (2.1927) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:17:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][610/625] eta 0:00:08 lr 0.001017 wd 0.0500 time 0.5761 (0.5800) data time 0.0004 (0.0021) model time 0.5757 (0.5781) loss 7.1700 (8.4422) grad_norm 2.9684 (2.1918) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:17:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [91/300][620/625] eta 0:00:02 lr 0.001017 wd 0.0500 time 0.6210 (0.5799) data time 0.0006 (0.0021) model time 0.6204 (0.5780) loss 8.7502 (8.4468) grad_norm 2.6165 (2.1948) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:17:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 91 training takes 0:06:02 +[2024-07-24 23:17:10 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-24 23:17:11 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-24 23:17:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.692 (0.692) Loss 0.5913 (0.5913) Acc@1 87.451 (87.451) Acc@5 98.389 (98.389) Mem 22339MB +[2024-07-24 23:17:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.177) Loss 0.9072 (0.7124) Acc@1 78.760 (84.584) Acc@5 95.752 (97.354) Mem 22339MB +[2024-07-24 23:17:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.153) Loss 1.0352 (0.8420) Acc@1 75.049 (81.115) Acc@5 94.141 (95.766) Mem 22339MB +[2024-07-24 23:17:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 80.828 Acc@5 95.771 +[2024-07-24 23:17:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 80.8% +[2024-07-24 23:17:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 80.83% +[2024-07-24 23:17:15 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-24 23:17:17 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-24 23:17:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.482 (0.482) Loss 0.5537 (0.5537) Acc@1 89.111 (89.111) Acc@5 98.486 (98.486) Mem 22339MB +[2024-07-24 23:17:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.8721 (0.6855) Acc@1 79.785 (85.684) Acc@5 95.898 (97.559) Mem 22339MB +[2024-07-24 23:17:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.143) Loss 0.9771 (0.7989) Acc@1 76.123 (82.375) Acc@5 94.922 (96.326) Mem 22339MB +[2024-07-24 23:17:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.104 Acc@5 96.351 +[2024-07-24 23:17:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.1% +[2024-07-24 23:17:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.10% +[2024-07-24 23:17:20 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-24 23:17:21 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-24 23:17:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][0/625] eta 0:12:24 lr 0.001016 wd 0.0500 time 1.1908 (1.1908) data time 0.6715 (0.6715) model time 0.0000 (0.0000) loss 8.4351 (8.4351) grad_norm 2.6164 (2.6164) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:17:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][10/625] eta 0:06:27 lr 0.001016 wd 0.0500 time 0.5728 (0.6302) data time 0.0008 (0.0618) model time 0.0000 (0.0000) loss 7.3275 (8.6118) grad_norm 2.3437 (2.3485) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:17:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][20/625] eta 0:06:05 lr 0.001016 wd 0.0500 time 0.5740 (0.6037) data time 0.0006 (0.0329) model time 0.0000 (0.0000) loss 6.8636 (8.5450) grad_norm 1.6587 (2.1968) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:17:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][30/625] eta 0:05:53 lr 0.001016 wd 0.0500 time 0.5886 (0.5943) data time 0.0006 (0.0225) model time 0.0000 (0.0000) loss 9.0449 (8.5620) grad_norm 2.5070 (2.3225) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:17:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][40/625] eta 0:05:45 lr 0.001016 wd 0.0500 time 0.5776 (0.5909) data time 0.0007 (0.0172) model time 0.0000 (0.0000) loss 8.7502 (8.5424) grad_norm 1.6773 (2.2963) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:17:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][50/625] eta 0:05:38 lr 0.001016 wd 0.0500 time 0.5714 (0.5879) data time 0.0006 (0.0140) model time 0.0000 (0.0000) loss 9.3206 (8.5053) grad_norm 2.0102 (2.2088) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:17:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][60/625] eta 0:05:30 lr 0.001016 wd 0.0500 time 0.5790 (0.5858) data time 0.0006 (0.0118) model time 0.5784 (0.5743) loss 6.7375 (8.4098) grad_norm 1.6442 (2.1712) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:18:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][70/625] eta 0:05:24 lr 0.001016 wd 0.0500 time 0.5753 (0.5843) data time 0.0008 (0.0103) model time 0.5745 (0.5741) loss 6.4648 (8.3713) grad_norm 1.6105 (2.1436) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:18:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][80/625] eta 0:05:17 lr 0.001016 wd 0.0500 time 0.5751 (0.5830) data time 0.0006 (0.0091) model time 0.5744 (0.5737) loss 8.6445 (8.3910) grad_norm 2.4002 (2.1184) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:18:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][90/625] eta 0:05:11 lr 0.001016 wd 0.0500 time 0.5727 (0.5820) data time 0.0006 (0.0082) model time 0.5721 (0.5736) loss 8.6571 (8.4454) grad_norm 2.1937 (2.1075) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:18:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][100/625] eta 0:05:05 lr 0.001016 wd 0.0500 time 0.5762 (0.5812) data time 0.0008 (0.0075) model time 0.5753 (0.5736) loss 8.7872 (8.4488) grad_norm 2.4167 (2.1039) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:18:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][110/625] eta 0:04:59 lr 0.001016 wd 0.0500 time 0.5761 (0.5807) data time 0.0009 (0.0069) model time 0.5752 (0.5738) loss 8.9399 (8.4415) grad_norm 1.7905 (2.0871) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:18:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][120/625] eta 0:04:53 lr 0.001016 wd 0.0500 time 0.5739 (0.5802) data time 0.0008 (0.0064) model time 0.5731 (0.5738) loss 8.9420 (8.4751) grad_norm 1.8054 (2.0947) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:18:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][130/625] eta 0:04:48 lr 0.001015 wd 0.0500 time 0.7376 (0.5837) data time 0.0008 (0.0060) model time 0.7369 (0.5801) loss 9.9152 (8.4815) grad_norm 2.1689 (2.1003) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:18:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][140/625] eta 0:04:43 lr 0.001015 wd 0.0500 time 0.5896 (0.5851) data time 0.0007 (0.0056) model time 0.5889 (0.5826) loss 8.2282 (8.4847) grad_norm 2.1079 (2.0816) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:18:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][150/625] eta 0:04:37 lr 0.001015 wd 0.0500 time 0.5771 (0.5844) data time 0.0009 (0.0053) model time 0.5762 (0.5817) loss 8.0854 (8.4783) grad_norm 2.4130 (2.0784) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:18:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][160/625] eta 0:04:31 lr 0.001015 wd 0.0500 time 0.5767 (0.5838) data time 0.0006 (0.0050) model time 0.5761 (0.5811) loss 9.4367 (8.5109) grad_norm 3.3503 (2.0819) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:19:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][170/625] eta 0:04:25 lr 0.001015 wd 0.0500 time 0.5776 (0.5833) data time 0.0008 (0.0048) model time 0.5768 (0.5806) loss 7.7986 (8.4927) grad_norm 2.2043 (2.0857) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:19:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][180/625] eta 0:04:19 lr 0.001015 wd 0.0500 time 0.5765 (0.5828) data time 0.0008 (0.0045) model time 0.5758 (0.5800) loss 9.1537 (8.4712) grad_norm 3.5335 (2.0869) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:19:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][190/625] eta 0:04:13 lr 0.001015 wd 0.0500 time 0.5766 (0.5824) data time 0.0007 (0.0043) model time 0.5760 (0.5796) loss 7.5878 (8.4466) grad_norm 2.1264 (2.0901) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:19:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][200/625] eta 0:04:07 lr 0.001015 wd 0.0500 time 0.5726 (0.5826) data time 0.0010 (0.0042) model time 0.5716 (0.5800) loss 8.9041 (8.4498) grad_norm 2.0511 (2.0815) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:19:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][210/625] eta 0:04:01 lr 0.001015 wd 0.0500 time 0.5744 (0.5822) data time 0.0009 (0.0040) model time 0.5736 (0.5796) loss 9.0550 (8.4652) grad_norm 1.5592 (2.0746) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:19:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][220/625] eta 0:03:55 lr 0.001015 wd 0.0500 time 0.5738 (0.5819) data time 0.0008 (0.0039) model time 0.5730 (0.5793) loss 6.6822 (8.4479) grad_norm 2.1073 (2.0761) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:19:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][230/625] eta 0:03:49 lr 0.001015 wd 0.0500 time 0.5772 (0.5816) data time 0.0007 (0.0037) model time 0.5765 (0.5790) loss 8.2395 (8.4283) grad_norm 3.2196 (2.0808) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:19:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][240/625] eta 0:03:43 lr 0.001015 wd 0.0500 time 0.5725 (0.5813) data time 0.0009 (0.0036) model time 0.5716 (0.5787) loss 7.7777 (8.3961) grad_norm 1.8547 (2.0741) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:19:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][250/625] eta 0:03:37 lr 0.001015 wd 0.0500 time 0.5718 (0.5810) data time 0.0009 (0.0035) model time 0.5709 (0.5784) loss 9.1477 (8.3940) grad_norm 2.4543 (2.0823) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:19:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][260/625] eta 0:03:32 lr 0.001014 wd 0.0500 time 0.5755 (0.5808) data time 0.0008 (0.0034) model time 0.5747 (0.5783) loss 8.6209 (8.3967) grad_norm 2.5894 (2.0967) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:19:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][270/625] eta 0:03:26 lr 0.001014 wd 0.0500 time 0.5752 (0.5806) data time 0.0006 (0.0033) model time 0.5747 (0.5781) loss 9.0170 (8.3942) grad_norm 1.8420 (2.0890) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:20:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][280/625] eta 0:03:20 lr 0.001014 wd 0.0500 time 0.5757 (0.5804) data time 0.0006 (0.0032) model time 0.5751 (0.5779) loss 6.2971 (8.3823) grad_norm 1.9273 (2.0760) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:20:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][290/625] eta 0:03:14 lr 0.001014 wd 0.0500 time 0.5759 (0.5803) data time 0.0006 (0.0031) model time 0.5753 (0.5778) loss 6.8308 (8.3869) grad_norm 2.7685 (2.0837) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:20:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][300/625] eta 0:03:08 lr 0.001014 wd 0.0500 time 0.5732 (0.5801) data time 0.0008 (0.0031) model time 0.5724 (0.5777) loss 8.3712 (8.3820) grad_norm 1.8442 (2.0928) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:20:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][310/625] eta 0:03:02 lr 0.001014 wd 0.0500 time 0.5740 (0.5799) data time 0.0007 (0.0030) model time 0.5733 (0.5775) loss 6.6239 (8.3704) grad_norm 2.5887 (2.0982) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:20:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][320/625] eta 0:02:56 lr 0.001014 wd 0.0500 time 0.5726 (0.5797) data time 0.0006 (0.0029) model time 0.5720 (0.5773) loss 8.3804 (8.3584) grad_norm 2.0671 (2.1052) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:20:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][330/625] eta 0:02:50 lr 0.001014 wd 0.0500 time 0.5762 (0.5796) data time 0.0008 (0.0029) model time 0.5755 (0.5772) loss 7.4534 (8.3548) grad_norm 2.1139 (2.1025) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:20:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][340/625] eta 0:02:45 lr 0.001014 wd 0.0500 time 0.5720 (0.5794) data time 0.0010 (0.0028) model time 0.5710 (0.5771) loss 8.4991 (8.3501) grad_norm 1.5807 (2.1023) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:20:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][350/625] eta 0:02:39 lr 0.001014 wd 0.0500 time 0.7298 (0.5805) data time 0.0006 (0.0027) model time 0.7292 (0.5784) loss 9.0188 (8.3600) grad_norm 2.1072 (2.1022) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:20:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][360/625] eta 0:02:34 lr 0.001014 wd 0.0500 time 0.5746 (0.5813) data time 0.0008 (0.0027) model time 0.5738 (0.5794) loss 7.5022 (8.3608) grad_norm 1.5874 (2.0992) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:20:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][370/625] eta 0:02:28 lr 0.001014 wd 0.0500 time 0.5780 (0.5812) data time 0.0008 (0.0026) model time 0.5772 (0.5793) loss 8.1236 (8.3534) grad_norm 1.6750 (2.0976) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:21:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][380/625] eta 0:02:22 lr 0.001014 wd 0.0500 time 0.5758 (0.5810) data time 0.0008 (0.0026) model time 0.5750 (0.5791) loss 9.5402 (8.3629) grad_norm 3.2537 (2.0943) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:21:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][390/625] eta 0:02:16 lr 0.001013 wd 0.0500 time 0.5776 (0.5809) data time 0.0007 (0.0025) model time 0.5769 (0.5790) loss 8.8240 (8.3627) grad_norm 1.8130 (2.0915) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:21:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][400/625] eta 0:02:10 lr 0.001013 wd 0.0500 time 0.5768 (0.5808) data time 0.0008 (0.0025) model time 0.5760 (0.5789) loss 7.7418 (8.3486) grad_norm 1.4771 (2.0893) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:21:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][410/625] eta 0:02:04 lr 0.001013 wd 0.0500 time 0.5765 (0.5807) data time 0.0008 (0.0025) model time 0.5757 (0.5788) loss 7.6358 (8.3362) grad_norm 1.9800 (2.0890) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:21:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][420/625] eta 0:01:59 lr 0.001013 wd 0.0500 time 0.5756 (0.5809) data time 0.0009 (0.0024) model time 0.5747 (0.5791) loss 9.0243 (8.3348) grad_norm 2.2614 (2.0872) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:21:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][430/625] eta 0:01:53 lr 0.001013 wd 0.0500 time 0.5716 (0.5808) data time 0.0009 (0.0024) model time 0.5707 (0.5790) loss 9.0183 (8.3309) grad_norm 1.8431 (2.0833) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:21:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][440/625] eta 0:01:47 lr 0.001013 wd 0.0500 time 0.5765 (0.5807) data time 0.0007 (0.0023) model time 0.5758 (0.5789) loss 7.7171 (8.3276) grad_norm 1.6869 (2.0753) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:21:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][450/625] eta 0:01:41 lr 0.001013 wd 0.0500 time 0.5730 (0.5806) data time 0.0006 (0.0023) model time 0.5724 (0.5788) loss 8.9649 (8.3387) grad_norm 2.5330 (2.0765) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:21:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][460/625] eta 0:01:35 lr 0.001013 wd 0.0500 time 0.5774 (0.5805) data time 0.0006 (0.0023) model time 0.5768 (0.5788) loss 7.7704 (8.3368) grad_norm 2.5149 (2.0751) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:21:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][470/625] eta 0:01:29 lr 0.001013 wd 0.0500 time 0.5784 (0.5804) data time 0.0006 (0.0022) model time 0.5778 (0.5786) loss 6.7748 (8.3358) grad_norm 1.4609 (2.0766) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:22:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][480/625] eta 0:01:24 lr 0.001013 wd 0.0500 time 0.5739 (0.5802) data time 0.0008 (0.0022) model time 0.5730 (0.5785) loss 9.1008 (8.3424) grad_norm 2.3072 (2.0744) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:22:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][490/625] eta 0:01:18 lr 0.001013 wd 0.0500 time 0.5731 (0.5801) data time 0.0008 (0.0022) model time 0.5722 (0.5784) loss 7.9416 (8.3403) grad_norm 3.3662 (2.0765) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:22:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][500/625] eta 0:01:12 lr 0.001013 wd 0.0500 time 0.5725 (0.5800) data time 0.0008 (0.0022) model time 0.5717 (0.5783) loss 8.2379 (8.3360) grad_norm 1.9028 (2.0762) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:22:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][510/625] eta 0:01:06 lr 0.001013 wd 0.0500 time 0.5680 (0.5799) data time 0.0008 (0.0021) model time 0.5673 (0.5782) loss 9.1655 (8.3390) grad_norm 2.1165 (2.0767) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:22:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][520/625] eta 0:01:00 lr 0.001012 wd 0.0500 time 0.5801 (0.5798) data time 0.0008 (0.0021) model time 0.5793 (0.5781) loss 7.8073 (8.3418) grad_norm 2.2015 (2.0764) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:22:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][530/625] eta 0:00:55 lr 0.001012 wd 0.0500 time 0.5673 (0.5797) data time 0.0007 (0.0021) model time 0.5667 (0.5780) loss 8.9596 (8.3429) grad_norm 1.5244 (2.0743) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:22:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][540/625] eta 0:00:49 lr 0.001012 wd 0.0500 time 0.5817 (0.5796) data time 0.0007 (0.0021) model time 0.5810 (0.5779) loss 8.3952 (8.3413) grad_norm 1.6941 (2.0750) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:22:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][550/625] eta 0:00:43 lr 0.001012 wd 0.0500 time 0.5787 (0.5796) data time 0.0007 (0.0020) model time 0.5780 (0.5780) loss 8.0426 (8.3465) grad_norm 1.6181 (2.0743) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:22:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][560/625] eta 0:00:37 lr 0.001012 wd 0.0500 time 0.5734 (0.5795) data time 0.0007 (0.0020) model time 0.5727 (0.5779) loss 6.4582 (8.3377) grad_norm 1.9102 (2.0731) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:22:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][570/625] eta 0:00:31 lr 0.001012 wd 0.0500 time 0.5721 (0.5800) data time 0.0009 (0.0020) model time 0.5712 (0.5784) loss 9.3984 (8.3450) grad_norm 1.7697 (2.0669) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:22:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][580/625] eta 0:00:26 lr 0.001012 wd 0.0500 time 0.5753 (0.5806) data time 0.0007 (0.0020) model time 0.5746 (0.5791) loss 8.0897 (8.3498) grad_norm 2.2667 (2.0677) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:23:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][590/625] eta 0:00:20 lr 0.001012 wd 0.0500 time 0.5794 (0.5806) data time 0.0007 (0.0019) model time 0.5787 (0.5791) loss 7.4076 (8.3472) grad_norm 2.1549 (2.0722) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:23:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][600/625] eta 0:00:14 lr 0.001012 wd 0.0500 time 0.5761 (0.5805) data time 0.0006 (0.0019) model time 0.5756 (0.5790) loss 6.2328 (8.3439) grad_norm 2.4875 (2.0796) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:23:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][610/625] eta 0:00:08 lr 0.001012 wd 0.0500 time 0.5793 (0.5804) data time 0.0004 (0.0019) model time 0.5789 (0.5789) loss 8.6651 (8.3461) grad_norm 2.5213 (2.0857) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:23:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [92/300][620/625] eta 0:00:02 lr 0.001012 wd 0.0500 time 0.5750 (0.5803) data time 0.0005 (0.0019) model time 0.5745 (0.5788) loss 8.6945 (8.3397) grad_norm 2.4781 (2.0841) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:23:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 92 training takes 0:06:02 +[2024-07-24 23:23:24 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-24 23:23:26 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-24 23:23:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 2.544 (2.544) Loss 0.5449 (0.5449) Acc@1 88.525 (88.525) Acc@5 98.535 (98.535) Mem 22339MB +[2024-07-24 23:23:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.345) Loss 0.9106 (0.6945) Acc@1 78.467 (84.668) Acc@5 95.410 (97.332) Mem 22339MB +[2024-07-24 23:23:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.241) Loss 1.0117 (0.8202) Acc@1 75.342 (81.310) Acc@5 94.287 (95.903) Mem 22339MB +[2024-07-24 23:23:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 80.984 Acc@5 95.843 +[2024-07-24 23:23:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.0% +[2024-07-24 23:23:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 80.98% +[2024-07-24 23:23:31 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-24 23:23:32 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-24 23:23:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 1.962 (1.962) Loss 0.5503 (0.5503) Acc@1 89.209 (89.209) Acc@5 98.535 (98.535) Mem 22339MB +[2024-07-24 23:23:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.292) Loss 0.8682 (0.6820) Acc@1 79.932 (85.720) Acc@5 95.850 (97.554) Mem 22339MB +[2024-07-24 23:23:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.213) Loss 0.9731 (0.7956) Acc@1 76.318 (82.422) Acc@5 94.922 (96.324) Mem 22339MB +[2024-07-24 23:23:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.144 Acc@5 96.343 +[2024-07-24 23:23:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.1% +[2024-07-24 23:23:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.14% +[2024-07-24 23:23:37 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-24 23:23:39 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-24 23:23:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][0/625] eta 0:12:59 lr 0.001012 wd 0.0500 time 1.2472 (1.2472) data time 0.7178 (0.7178) model time 0.0000 (0.0000) loss 9.8698 (9.8698) grad_norm 2.2557 (2.2557) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:23:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][10/625] eta 0:06:30 lr 0.001012 wd 0.0500 time 0.5725 (0.6344) data time 0.0006 (0.0661) model time 0.0000 (0.0000) loss 8.0554 (8.5380) grad_norm 1.4103 (1.9404) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:23:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][20/625] eta 0:06:07 lr 0.001011 wd 0.0500 time 0.5752 (0.6072) data time 0.0008 (0.0350) model time 0.0000 (0.0000) loss 8.9815 (8.2548) grad_norm 2.3017 (1.9026) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:23:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][30/625] eta 0:05:55 lr 0.001011 wd 0.0500 time 0.5756 (0.5971) data time 0.0008 (0.0240) model time 0.0000 (0.0000) loss 9.2061 (8.2204) grad_norm 1.5438 (1.8796) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:24:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][40/625] eta 0:05:46 lr 0.001011 wd 0.0500 time 0.5738 (0.5916) data time 0.0006 (0.0183) model time 0.0000 (0.0000) loss 7.9412 (8.1543) grad_norm 1.5861 (1.8635) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:24:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][50/625] eta 0:05:38 lr 0.001011 wd 0.0500 time 0.5741 (0.5880) data time 0.0006 (0.0149) model time 0.0000 (0.0000) loss 6.7931 (8.0806) grad_norm 2.5618 (1.8703) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:24:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][60/625] eta 0:05:31 lr 0.001011 wd 0.0500 time 0.5783 (0.5859) data time 0.0006 (0.0126) model time 0.5778 (0.5745) loss 8.2925 (8.1124) grad_norm 1.6487 (1.8873) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:24:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][70/625] eta 0:05:24 lr 0.001011 wd 0.0500 time 0.5752 (0.5845) data time 0.0006 (0.0109) model time 0.5747 (0.5750) loss 7.3877 (8.0864) grad_norm 1.7296 (1.9071) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:24:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][80/625] eta 0:05:18 lr 0.001011 wd 0.0500 time 0.5719 (0.5835) data time 0.0007 (0.0097) model time 0.5711 (0.5751) loss 9.1484 (8.0570) grad_norm 2.0016 (1.9075) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:24:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][90/625] eta 0:05:11 lr 0.001011 wd 0.0500 time 0.5766 (0.5827) data time 0.0008 (0.0087) model time 0.5759 (0.5751) loss 7.1434 (8.0888) grad_norm 1.9153 (1.9193) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:24:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][100/625] eta 0:05:05 lr 0.001011 wd 0.0500 time 0.5758 (0.5819) data time 0.0008 (0.0079) model time 0.5750 (0.5749) loss 8.2444 (8.1064) grad_norm 3.1335 (1.9688) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:24:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][110/625] eta 0:04:59 lr 0.001011 wd 0.0500 time 0.5748 (0.5813) data time 0.0009 (0.0073) model time 0.5740 (0.5747) loss 7.2370 (8.0681) grad_norm 3.0976 (1.9751) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:24:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][120/625] eta 0:04:53 lr 0.001011 wd 0.0500 time 0.5759 (0.5807) data time 0.0006 (0.0068) model time 0.5753 (0.5745) loss 8.9911 (8.1021) grad_norm 2.6270 (2.0106) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:24:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][130/625] eta 0:04:47 lr 0.001011 wd 0.0500 time 0.5717 (0.5802) data time 0.0006 (0.0063) model time 0.5711 (0.5744) loss 8.2375 (8.1123) grad_norm 1.6822 (1.9997) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:25:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][140/625] eta 0:04:41 lr 0.001011 wd 0.0500 time 0.7843 (0.5813) data time 0.0007 (0.0059) model time 0.7836 (0.5766) loss 7.2193 (8.1161) grad_norm 1.6348 (1.9926) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:25:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][150/625] eta 0:04:35 lr 0.001010 wd 0.0500 time 0.5776 (0.5805) data time 0.0008 (0.0056) model time 0.5767 (0.5758) loss 9.7311 (8.1456) grad_norm 1.6222 (1.9765) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:25:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][160/625] eta 0:04:29 lr 0.001010 wd 0.0500 time 0.5689 (0.5802) data time 0.0009 (0.0053) model time 0.5680 (0.5757) loss 9.5037 (8.1760) grad_norm 3.5940 (1.9941) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:25:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][170/625] eta 0:04:26 lr 0.001010 wd 0.0500 time 0.7565 (0.5848) data time 0.0007 (0.0050) model time 0.7558 (0.5827) loss 7.9117 (8.1900) grad_norm 1.7036 (2.0021) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:25:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][180/625] eta 0:04:20 lr 0.001010 wd 0.0500 time 0.5878 (0.5851) data time 0.0006 (0.0048) model time 0.5872 (0.5831) loss 6.9409 (8.1925) grad_norm 1.6933 (1.9990) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:25:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][190/625] eta 0:04:14 lr 0.001010 wd 0.0500 time 0.5724 (0.5845) data time 0.0006 (0.0046) model time 0.5717 (0.5825) loss 7.6072 (8.1817) grad_norm 2.5814 (2.0105) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:25:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][200/625] eta 0:04:08 lr 0.001010 wd 0.0500 time 0.5759 (0.5841) data time 0.0006 (0.0044) model time 0.5753 (0.5820) loss 8.6060 (8.2069) grad_norm 2.1258 (2.0077) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:25:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][210/625] eta 0:04:02 lr 0.001010 wd 0.0500 time 0.5723 (0.5837) data time 0.0008 (0.0042) model time 0.5715 (0.5816) loss 8.2022 (8.1868) grad_norm 2.0994 (2.0281) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:25:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][220/625] eta 0:03:56 lr 0.001010 wd 0.0500 time 0.5857 (0.5834) data time 0.0006 (0.0041) model time 0.5851 (0.5812) loss 8.4755 (8.2083) grad_norm 2.7540 (2.0348) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:25:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][230/625] eta 0:03:50 lr 0.001010 wd 0.0500 time 0.5895 (0.5831) data time 0.0006 (0.0039) model time 0.5888 (0.5809) loss 9.1611 (8.2124) grad_norm 2.1164 (2.0325) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:25:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][240/625] eta 0:03:44 lr 0.001010 wd 0.0500 time 0.5706 (0.5827) data time 0.0006 (0.0038) model time 0.5700 (0.5805) loss 9.1842 (8.2241) grad_norm 1.9974 (2.0214) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:26:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][250/625] eta 0:03:38 lr 0.001010 wd 0.0500 time 0.5776 (0.5824) data time 0.0009 (0.0037) model time 0.5768 (0.5802) loss 9.2693 (8.2244) grad_norm 2.2982 (2.0190) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:26:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][260/625] eta 0:03:32 lr 0.001010 wd 0.0500 time 0.5788 (0.5822) data time 0.0008 (0.0036) model time 0.5780 (0.5799) loss 8.2353 (8.2271) grad_norm 1.4468 (2.0090) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:26:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][270/625] eta 0:03:26 lr 0.001010 wd 0.0500 time 0.5735 (0.5819) data time 0.0008 (0.0035) model time 0.5727 (0.5797) loss 8.7764 (8.2292) grad_norm 2.1541 (2.0088) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:26:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][280/625] eta 0:03:20 lr 0.001009 wd 0.0500 time 0.5740 (0.5817) data time 0.0008 (0.0034) model time 0.5732 (0.5794) loss 8.7388 (8.2391) grad_norm 1.8567 (2.0100) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:26:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][290/625] eta 0:03:14 lr 0.001009 wd 0.0500 time 0.5786 (0.5815) data time 0.0006 (0.0033) model time 0.5780 (0.5793) loss 9.2936 (8.2417) grad_norm 1.9227 (2.0091) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:26:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][300/625] eta 0:03:08 lr 0.001009 wd 0.0500 time 0.5751 (0.5813) data time 0.0008 (0.0032) model time 0.5743 (0.5791) loss 9.0556 (8.2334) grad_norm 1.6743 (2.0055) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:26:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][310/625] eta 0:03:03 lr 0.001009 wd 0.0500 time 0.5766 (0.5811) data time 0.0006 (0.0031) model time 0.5760 (0.5789) loss 9.0478 (8.2351) grad_norm 1.6856 (1.9986) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:26:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][320/625] eta 0:02:57 lr 0.001009 wd 0.0500 time 0.5752 (0.5809) data time 0.0006 (0.0031) model time 0.5746 (0.5788) loss 6.9253 (8.2388) grad_norm 2.4997 (1.9964) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:26:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][330/625] eta 0:02:51 lr 0.001009 wd 0.0500 time 0.5734 (0.5807) data time 0.0009 (0.0030) model time 0.5725 (0.5786) loss 8.0394 (8.2316) grad_norm 1.8276 (1.9928) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:26:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][340/625] eta 0:02:45 lr 0.001009 wd 0.0500 time 0.5741 (0.5805) data time 0.0008 (0.0029) model time 0.5733 (0.5784) loss 7.5300 (8.2239) grad_norm 3.4315 (1.9964) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:27:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][350/625] eta 0:02:39 lr 0.001009 wd 0.0500 time 0.5752 (0.5803) data time 0.0007 (0.0029) model time 0.5745 (0.5782) loss 9.3254 (8.2152) grad_norm 1.6204 (2.0007) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:27:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][360/625] eta 0:02:33 lr 0.001009 wd 0.0500 time 0.5735 (0.5802) data time 0.0008 (0.0028) model time 0.5727 (0.5781) loss 8.3129 (8.2120) grad_norm 2.0956 (2.0022) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:27:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][370/625] eta 0:02:27 lr 0.001009 wd 0.0500 time 0.5730 (0.5802) data time 0.0008 (0.0028) model time 0.5723 (0.5781) loss 7.7095 (8.2031) grad_norm 1.8519 (2.0050) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:27:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][380/625] eta 0:02:22 lr 0.001009 wd 0.0500 time 0.7452 (0.5805) data time 0.0005 (0.0027) model time 0.7446 (0.5785) loss 8.4126 (8.2038) grad_norm 1.6041 (2.0021) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:27:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][390/625] eta 0:02:16 lr 0.001009 wd 0.0500 time 0.5741 (0.5822) data time 0.0008 (0.0027) model time 0.5733 (0.5806) loss 8.4432 (8.2072) grad_norm 1.4209 (2.0028) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:27:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][400/625] eta 0:02:11 lr 0.001009 wd 0.0500 time 0.5830 (0.5824) data time 0.0006 (0.0026) model time 0.5824 (0.5807) loss 7.7337 (8.2116) grad_norm 1.9063 (2.0095) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:27:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][410/625] eta 0:02:05 lr 0.001008 wd 0.0500 time 0.5742 (0.5822) data time 0.0006 (0.0026) model time 0.5736 (0.5805) loss 9.4684 (8.2097) grad_norm 1.7583 (2.0104) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:27:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][420/625] eta 0:01:59 lr 0.001008 wd 0.0500 time 0.5968 (0.5821) data time 0.0007 (0.0025) model time 0.5960 (0.5804) loss 9.3271 (8.2046) grad_norm 2.0533 (2.0232) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:27:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][430/625] eta 0:01:53 lr 0.001008 wd 0.0500 time 0.5774 (0.5819) data time 0.0008 (0.0025) model time 0.5766 (0.5802) loss 7.4603 (8.1993) grad_norm 3.0763 (2.0334) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:27:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][440/625] eta 0:01:47 lr 0.001008 wd 0.0500 time 0.5723 (0.5817) data time 0.0006 (0.0024) model time 0.5717 (0.5801) loss 8.2487 (8.2000) grad_norm 1.6786 (2.0335) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:28:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][450/625] eta 0:01:41 lr 0.001008 wd 0.0500 time 0.5829 (0.5816) data time 0.0008 (0.0024) model time 0.5821 (0.5799) loss 9.0250 (8.2108) grad_norm 2.2616 (2.0287) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:28:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][460/625] eta 0:01:35 lr 0.001008 wd 0.0500 time 0.5913 (0.5815) data time 0.0006 (0.0024) model time 0.5907 (0.5798) loss 9.6622 (8.2224) grad_norm 2.6412 (2.0351) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:28:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][470/625] eta 0:01:30 lr 0.001008 wd 0.0500 time 0.5773 (0.5813) data time 0.0009 (0.0023) model time 0.5764 (0.5797) loss 8.4786 (8.2202) grad_norm 1.8742 (2.0383) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:28:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][480/625] eta 0:01:24 lr 0.001008 wd 0.0500 time 0.5790 (0.5812) data time 0.0008 (0.0023) model time 0.5782 (0.5795) loss 8.1456 (8.2229) grad_norm 1.5485 (2.0378) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:28:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][490/625] eta 0:01:18 lr 0.001008 wd 0.0500 time 0.5750 (0.5810) data time 0.0007 (0.0023) model time 0.5743 (0.5794) loss 9.5656 (8.2233) grad_norm 2.6328 (2.0379) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:28:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][500/625] eta 0:01:12 lr 0.001008 wd 0.0500 time 0.5776 (0.5809) data time 0.0008 (0.0023) model time 0.5768 (0.5793) loss 7.7399 (8.2161) grad_norm 2.8314 (2.0408) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:28:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][510/625] eta 0:01:06 lr 0.001008 wd 0.0500 time 0.5758 (0.5808) data time 0.0007 (0.0022) model time 0.5751 (0.5792) loss 9.5621 (8.2203) grad_norm 1.6281 (2.0398) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:28:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][520/625] eta 0:01:00 lr 0.001008 wd 0.0500 time 0.5737 (0.5807) data time 0.0008 (0.0022) model time 0.5729 (0.5791) loss 9.7235 (8.2185) grad_norm 1.5990 (2.0375) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:28:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][530/625] eta 0:00:55 lr 0.001008 wd 0.0500 time 0.5762 (0.5806) data time 0.0006 (0.0022) model time 0.5756 (0.5790) loss 10.0901 (8.2292) grad_norm 1.9645 (2.0338) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:28:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][540/625] eta 0:00:49 lr 0.001007 wd 0.0500 time 0.5779 (0.5805) data time 0.0006 (0.0021) model time 0.5773 (0.5789) loss 6.6676 (8.2230) grad_norm 1.9699 (2.0335) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:28:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][550/625] eta 0:00:43 lr 0.001007 wd 0.0500 time 0.5737 (0.5804) data time 0.0006 (0.0021) model time 0.5731 (0.5788) loss 6.8496 (8.2203) grad_norm 2.2129 (2.0381) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:29:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][560/625] eta 0:00:37 lr 0.001007 wd 0.0500 time 0.5775 (0.5803) data time 0.0008 (0.0021) model time 0.5767 (0.5787) loss 9.0556 (8.2256) grad_norm 1.9869 (2.0357) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:29:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][570/625] eta 0:00:31 lr 0.001007 wd 0.0500 time 0.5712 (0.5802) data time 0.0007 (0.0021) model time 0.5705 (0.5786) loss 7.0727 (8.2236) grad_norm 1.8405 (2.0384) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:29:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][580/625] eta 0:00:26 lr 0.001007 wd 0.0500 time 0.5754 (0.5801) data time 0.0006 (0.0021) model time 0.5748 (0.5785) loss 8.3205 (8.2320) grad_norm 2.8971 (2.0402) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:29:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][590/625] eta 0:00:20 lr 0.001007 wd 0.0500 time 0.5752 (0.5802) data time 0.0006 (0.0020) model time 0.5746 (0.5787) loss 7.0798 (8.2239) grad_norm 3.0793 (2.0462) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:29:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][600/625] eta 0:00:14 lr 0.001007 wd 0.0500 time 0.5834 (0.5802) data time 0.0006 (0.0020) model time 0.5828 (0.5787) loss 7.4618 (8.2288) grad_norm 1.6128 (2.0512) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:29:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][610/625] eta 0:00:08 lr 0.001007 wd 0.0500 time 0.5695 (0.5811) data time 0.0004 (0.0020) model time 0.5691 (0.5796) loss 8.6739 (8.2306) grad_norm 1.6073 (2.0484) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:29:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [93/300][620/625] eta 0:00:02 lr 0.001007 wd 0.0500 time 0.5732 (0.5815) data time 0.0006 (0.0020) model time 0.5727 (0.5801) loss 7.3618 (8.2294) grad_norm 2.6067 (2.0493) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:29:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 93 training takes 0:06:03 +[2024-07-24 23:29:42 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-24 23:29:43 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-24 23:29:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.587 (0.587) Loss 0.5527 (0.5527) Acc@1 88.721 (88.721) Acc@5 98.389 (98.389) Mem 22339MB +[2024-07-24 23:29:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.168) Loss 0.8872 (0.6774) Acc@1 78.613 (85.014) Acc@5 95.459 (97.390) Mem 22339MB +[2024-07-24 23:29:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.148) Loss 0.9868 (0.8071) Acc@1 76.172 (81.557) Acc@5 94.092 (95.929) Mem 22339MB +[2024-07-24 23:29:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.216 Acc@5 95.919 +[2024-07-24 23:29:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.2% +[2024-07-24 23:29:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 81.22% +[2024-07-24 23:29:47 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-24 23:29:48 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-24 23:29:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.508 (0.508) Loss 0.5464 (0.5464) Acc@1 89.111 (89.111) Acc@5 98.535 (98.535) Mem 22339MB +[2024-07-24 23:29:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.160) Loss 0.8652 (0.6788) Acc@1 79.834 (85.680) Acc@5 95.947 (97.581) Mem 22339MB +[2024-07-24 23:29:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.144) Loss 0.9712 (0.7923) Acc@1 76.172 (82.396) Acc@5 94.922 (96.350) Mem 22339MB +[2024-07-24 23:29:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.126 Acc@5 96.367 +[2024-07-24 23:29:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.1% +[2024-07-24 23:29:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][0/625] eta 0:16:55 lr 0.001007 wd 0.0500 time 1.6254 (1.6254) data time 0.7013 (0.7013) model time 0.0000 (0.0000) loss 9.1067 (9.1067) grad_norm 1.7040 (1.7040) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:29:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][10/625] eta 0:06:51 lr 0.001007 wd 0.0500 time 0.5738 (0.6693) data time 0.0007 (0.0645) model time 0.0000 (0.0000) loss 9.1889 (8.2520) grad_norm 1.9484 (2.0511) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:30:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][20/625] eta 0:06:17 lr 0.001007 wd 0.0500 time 0.5748 (0.6243) data time 0.0008 (0.0342) model time 0.0000 (0.0000) loss 8.7250 (8.3397) grad_norm 1.8446 (2.0154) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:30:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][30/625] eta 0:06:01 lr 0.001007 wd 0.0500 time 0.5761 (0.6083) data time 0.0008 (0.0234) model time 0.0000 (0.0000) loss 8.5561 (8.4121) grad_norm 1.8564 (2.0403) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:30:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][40/625] eta 0:05:51 lr 0.001006 wd 0.0500 time 0.5756 (0.6000) data time 0.0006 (0.0179) model time 0.0000 (0.0000) loss 6.9799 (8.3343) grad_norm 2.1139 (1.9961) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:30:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][50/625] eta 0:05:42 lr 0.001006 wd 0.0500 time 0.5734 (0.5950) data time 0.0008 (0.0145) model time 0.0000 (0.0000) loss 8.5222 (8.3266) grad_norm 2.1144 (2.0042) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:30:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][60/625] eta 0:05:34 lr 0.001006 wd 0.0500 time 0.5723 (0.5916) data time 0.0006 (0.0123) model time 0.5718 (0.5733) loss 8.0991 (8.4084) grad_norm 1.9040 (1.9873) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:30:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][70/625] eta 0:05:26 lr 0.001006 wd 0.0500 time 0.5703 (0.5890) data time 0.0007 (0.0107) model time 0.5695 (0.5727) loss 8.4020 (8.4152) grad_norm 1.6829 (1.9713) loss_scale 8192.0000 (4326.7606) mem 22339MB +[2024-07-24 23:30:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][80/625] eta 0:05:20 lr 0.001006 wd 0.0500 time 0.5764 (0.5872) data time 0.0007 (0.0095) model time 0.5757 (0.5730) loss 8.9508 (8.3991) grad_norm 1.8192 (1.9772) loss_scale 8192.0000 (4803.9506) mem 22339MB +[2024-07-24 23:30:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][90/625] eta 0:05:13 lr 0.001006 wd 0.0500 time 0.5747 (0.5859) data time 0.0006 (0.0085) model time 0.5741 (0.5735) loss 8.2070 (8.3665) grad_norm 2.2274 (1.9657) loss_scale 8192.0000 (5176.2637) mem 22339MB +[2024-07-24 23:30:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][100/625] eta 0:05:07 lr 0.001006 wd 0.0500 time 0.5706 (0.5850) data time 0.0007 (0.0077) model time 0.5699 (0.5739) loss 7.8896 (8.3579) grad_norm 1.9017 (1.9745) loss_scale 8192.0000 (5474.8515) mem 22339MB +[2024-07-24 23:30:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][110/625] eta 0:05:00 lr 0.001006 wd 0.0500 time 0.5674 (0.5842) data time 0.0007 (0.0071) model time 0.5667 (0.5742) loss 8.4768 (8.3002) grad_norm 1.5967 (1.9826) loss_scale 8192.0000 (5719.6396) mem 22339MB +[2024-07-24 23:31:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][120/625] eta 0:04:55 lr 0.001006 wd 0.0500 time 0.5687 (0.5842) data time 0.0008 (0.0066) model time 0.5679 (0.5755) loss 7.6167 (8.2902) grad_norm 2.4140 (2.0546) loss_scale 8192.0000 (5923.9669) mem 22339MB +[2024-07-24 23:31:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][130/625] eta 0:04:48 lr 0.001006 wd 0.0500 time 0.5695 (0.5835) data time 0.0008 (0.0062) model time 0.5687 (0.5754) loss 8.0886 (8.2627) grad_norm 1.9265 (2.0821) loss_scale 8192.0000 (6097.0992) mem 22339MB +[2024-07-24 23:31:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][140/625] eta 0:04:42 lr 0.001006 wd 0.0500 time 0.5720 (0.5829) data time 0.0006 (0.0058) model time 0.5713 (0.5752) loss 8.6829 (8.2653) grad_norm 1.5608 (2.0624) loss_scale 8192.0000 (6245.6738) mem 22339MB +[2024-07-24 23:31:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][150/625] eta 0:04:36 lr 0.001006 wd 0.0500 time 0.5742 (0.5825) data time 0.0008 (0.0055) model time 0.5734 (0.5754) loss 6.2995 (8.2678) grad_norm 1.7549 (2.0436) loss_scale 8192.0000 (6374.5695) mem 22339MB +[2024-07-24 23:31:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][160/625] eta 0:04:30 lr 0.001005 wd 0.0500 time 0.5719 (0.5821) data time 0.0008 (0.0052) model time 0.5711 (0.5753) loss 8.3331 (8.2751) grad_norm 2.5604 (2.0407) loss_scale 8192.0000 (6487.4534) mem 22339MB +[2024-07-24 23:31:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][170/625] eta 0:04:24 lr 0.001005 wd 0.0500 time 0.5768 (0.5818) data time 0.0008 (0.0049) model time 0.5760 (0.5753) loss 9.2354 (8.2831) grad_norm 1.6437 (2.0315) loss_scale 8192.0000 (6587.1345) mem 22339MB +[2024-07-24 23:31:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][180/625] eta 0:04:18 lr 0.001005 wd 0.0500 time 0.5767 (0.5815) data time 0.0006 (0.0047) model time 0.5761 (0.5753) loss 8.8894 (8.2832) grad_norm 1.7519 (2.0283) loss_scale 8192.0000 (6675.8011) mem 22339MB +[2024-07-24 23:31:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][190/625] eta 0:04:12 lr 0.001005 wd 0.0500 time 0.5761 (0.5812) data time 0.0009 (0.0045) model time 0.5752 (0.5753) loss 9.5611 (8.3042) grad_norm 1.8527 (2.0309) loss_scale 8192.0000 (6755.1832) mem 22339MB +[2024-07-24 23:31:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][200/625] eta 0:04:07 lr 0.001005 wd 0.0500 time 0.5722 (0.5821) data time 0.0009 (0.0043) model time 0.5714 (0.5769) loss 9.3108 (8.3050) grad_norm 1.9587 (2.0453) loss_scale 8192.0000 (6826.6667) mem 22339MB +[2024-07-24 23:31:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][210/625] eta 0:04:02 lr 0.001005 wd 0.0500 time 0.7436 (0.5848) data time 0.0008 (0.0041) model time 0.7428 (0.5807) loss 7.3991 (8.3123) grad_norm 3.0096 (2.0642) loss_scale 8192.0000 (6891.3744) mem 22339MB +[2024-07-24 23:32:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][220/625] eta 0:03:56 lr 0.001005 wd 0.0500 time 0.5741 (0.5843) data time 0.0008 (0.0040) model time 0.5733 (0.5803) loss 8.7682 (8.3157) grad_norm 3.6623 (2.1107) loss_scale 8192.0000 (6950.2262) mem 22339MB +[2024-07-24 23:32:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][230/625] eta 0:03:50 lr 0.001005 wd 0.0500 time 0.5823 (0.5840) data time 0.0008 (0.0038) model time 0.5815 (0.5801) loss 8.2975 (8.3250) grad_norm 2.0162 (2.1224) loss_scale 8192.0000 (7003.9827) mem 22339MB +[2024-07-24 23:32:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][240/625] eta 0:03:44 lr 0.001005 wd 0.0500 time 0.5766 (0.5837) data time 0.0010 (0.0037) model time 0.5756 (0.5799) loss 7.3161 (8.3006) grad_norm 1.6766 (2.1139) loss_scale 8192.0000 (7053.2780) mem 22339MB +[2024-07-24 23:32:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][250/625] eta 0:03:38 lr 0.001005 wd 0.0500 time 0.5764 (0.5834) data time 0.0006 (0.0036) model time 0.5758 (0.5796) loss 8.6067 (8.3185) grad_norm 2.0710 (2.1080) loss_scale 8192.0000 (7098.6454) mem 22339MB +[2024-07-24 23:32:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][260/625] eta 0:03:32 lr 0.001005 wd 0.0500 time 0.5792 (0.5831) data time 0.0010 (0.0035) model time 0.5782 (0.5794) loss 7.2057 (8.2993) grad_norm 1.4590 (2.1005) loss_scale 8192.0000 (7140.5364) mem 22339MB +[2024-07-24 23:32:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][270/625] eta 0:03:26 lr 0.001005 wd 0.0500 time 0.5753 (0.5829) data time 0.0008 (0.0034) model time 0.5745 (0.5793) loss 7.6044 (8.3210) grad_norm 1.9313 (2.0876) loss_scale 8192.0000 (7179.3358) mem 22339MB +[2024-07-24 23:32:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][280/625] eta 0:03:20 lr 0.001005 wd 0.0500 time 0.5725 (0.5826) data time 0.0009 (0.0033) model time 0.5716 (0.5790) loss 9.6753 (8.3161) grad_norm 1.6555 (2.0834) loss_scale 8192.0000 (7215.3737) mem 22339MB +[2024-07-24 23:32:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][290/625] eta 0:03:15 lr 0.001004 wd 0.0500 time 0.5765 (0.5823) data time 0.0008 (0.0032) model time 0.5757 (0.5788) loss 9.3750 (8.3020) grad_norm 1.9940 (2.0797) loss_scale 8192.0000 (7248.9347) mem 22339MB +[2024-07-24 23:32:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][300/625] eta 0:03:09 lr 0.001004 wd 0.0500 time 0.5747 (0.5820) data time 0.0007 (0.0032) model time 0.5739 (0.5786) loss 10.2731 (8.3041) grad_norm 1.8837 (2.0787) loss_scale 8192.0000 (7280.2658) mem 22339MB +[2024-07-24 23:32:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][310/625] eta 0:03:03 lr 0.001004 wd 0.0500 time 0.5760 (0.5818) data time 0.0006 (0.0031) model time 0.5754 (0.5784) loss 8.5244 (8.2972) grad_norm 2.0693 (2.0850) loss_scale 8192.0000 (7309.5820) mem 22339MB +[2024-07-24 23:32:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][320/625] eta 0:02:57 lr 0.001004 wd 0.0500 time 0.5788 (0.5816) data time 0.0006 (0.0030) model time 0.5781 (0.5783) loss 7.7550 (8.2912) grad_norm 2.0385 (2.0827) loss_scale 8192.0000 (7337.0717) mem 22339MB +[2024-07-24 23:33:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][330/625] eta 0:02:51 lr 0.001004 wd 0.0500 time 0.5764 (0.5815) data time 0.0006 (0.0029) model time 0.5758 (0.5782) loss 6.8132 (8.2750) grad_norm 2.0330 (2.0823) loss_scale 8192.0000 (7362.9003) mem 22339MB +[2024-07-24 23:33:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][340/625] eta 0:02:45 lr 0.001004 wd 0.0500 time 0.5739 (0.5814) data time 0.0008 (0.0029) model time 0.5731 (0.5782) loss 6.3488 (8.2695) grad_norm 2.4416 (2.0890) loss_scale 8192.0000 (7387.2141) mem 22339MB +[2024-07-24 23:33:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][350/625] eta 0:02:39 lr 0.001004 wd 0.0500 time 0.5756 (0.5813) data time 0.0007 (0.0028) model time 0.5749 (0.5781) loss 7.9056 (8.2619) grad_norm 2.1549 (2.0923) loss_scale 8192.0000 (7410.1425) mem 22339MB +[2024-07-24 23:33:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][360/625] eta 0:02:33 lr 0.001004 wd 0.0500 time 0.5749 (0.5811) data time 0.0006 (0.0028) model time 0.5743 (0.5779) loss 6.7789 (8.2423) grad_norm 1.6774 (2.0907) loss_scale 8192.0000 (7431.8006) mem 22339MB +[2024-07-24 23:33:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][370/625] eta 0:02:28 lr 0.001004 wd 0.0500 time 0.5849 (0.5809) data time 0.0006 (0.0027) model time 0.5842 (0.5779) loss 7.5644 (8.2387) grad_norm 1.8137 (2.0813) loss_scale 8192.0000 (7452.2911) mem 22339MB +[2024-07-24 23:33:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][380/625] eta 0:02:22 lr 0.001004 wd 0.0500 time 0.5831 (0.5808) data time 0.0007 (0.0027) model time 0.5823 (0.5778) loss 8.8692 (8.2348) grad_norm 1.5435 (2.0821) loss_scale 8192.0000 (7471.7060) mem 22339MB +[2024-07-24 23:33:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][390/625] eta 0:02:16 lr 0.001004 wd 0.0500 time 0.5774 (0.5807) data time 0.0008 (0.0026) model time 0.5766 (0.5777) loss 9.1659 (8.2233) grad_norm 2.3567 (2.0792) loss_scale 8192.0000 (7490.1279) mem 22339MB +[2024-07-24 23:33:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][400/625] eta 0:02:10 lr 0.001004 wd 0.0500 time 0.5748 (0.5805) data time 0.0008 (0.0026) model time 0.5740 (0.5776) loss 6.9419 (8.2302) grad_norm 1.5792 (2.0819) loss_scale 8192.0000 (7507.6309) mem 22339MB +[2024-07-24 23:33:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][410/625] eta 0:02:04 lr 0.001004 wd 0.0500 time 0.5789 (0.5804) data time 0.0009 (0.0025) model time 0.5780 (0.5776) loss 9.3751 (8.2248) grad_norm 1.6196 (2.0818) loss_scale 8192.0000 (7524.2822) mem 22339MB +[2024-07-24 23:33:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][420/625] eta 0:01:59 lr 0.001003 wd 0.0500 time 0.7596 (0.5820) data time 0.0006 (0.0025) model time 0.7590 (0.5794) loss 7.7816 (8.2266) grad_norm 1.9247 (inf) loss_scale 4096.0000 (7452.5796) mem 22339MB +[2024-07-24 23:34:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][430/625] eta 0:01:53 lr 0.001003 wd 0.0500 time 0.5756 (0.5833) data time 0.0006 (0.0025) model time 0.5751 (0.5809) loss 8.5853 (8.2325) grad_norm 2.9927 (inf) loss_scale 4096.0000 (7374.7007) mem 22339MB +[2024-07-24 23:34:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][440/625] eta 0:01:47 lr 0.001003 wd 0.0500 time 0.5778 (0.5835) data time 0.0006 (0.0024) model time 0.5772 (0.5811) loss 8.5447 (8.2358) grad_norm 2.1204 (inf) loss_scale 4096.0000 (7300.3537) mem 22339MB +[2024-07-24 23:34:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][450/625] eta 0:01:42 lr 0.001003 wd 0.0500 time 0.5801 (0.5833) data time 0.0008 (0.0024) model time 0.5793 (0.5810) loss 7.5260 (8.2310) grad_norm 1.4132 (inf) loss_scale 4096.0000 (7229.3038) mem 22339MB +[2024-07-24 23:34:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][460/625] eta 0:01:36 lr 0.001003 wd 0.0500 time 0.5775 (0.5831) data time 0.0008 (0.0024) model time 0.5767 (0.5808) loss 8.9612 (8.2280) grad_norm 2.5365 (inf) loss_scale 4096.0000 (7161.3362) mem 22339MB +[2024-07-24 23:34:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][470/625] eta 0:01:30 lr 0.001003 wd 0.0500 time 0.5778 (0.5829) data time 0.0006 (0.0023) model time 0.5772 (0.5806) loss 8.5454 (8.2305) grad_norm 1.7463 (inf) loss_scale 4096.0000 (7096.2548) mem 22339MB +[2024-07-24 23:34:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][480/625] eta 0:01:24 lr 0.001003 wd 0.0500 time 0.5915 (0.5828) data time 0.0006 (0.0023) model time 0.5909 (0.5805) loss 7.2842 (8.2354) grad_norm 2.5713 (inf) loss_scale 4096.0000 (7033.8794) mem 22339MB +[2024-07-24 23:34:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][490/625] eta 0:01:18 lr 0.001003 wd 0.0500 time 0.5750 (0.5827) data time 0.0008 (0.0023) model time 0.5742 (0.5804) loss 9.8478 (8.2348) grad_norm 1.6834 (inf) loss_scale 4096.0000 (6974.0448) mem 22339MB +[2024-07-24 23:34:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][500/625] eta 0:01:12 lr 0.001003 wd 0.0500 time 0.5741 (0.5825) data time 0.0006 (0.0022) model time 0.5735 (0.5802) loss 7.3245 (8.2354) grad_norm 2.4647 (inf) loss_scale 4096.0000 (6916.5988) mem 22339MB +[2024-07-24 23:34:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][510/625] eta 0:01:06 lr 0.001003 wd 0.0500 time 0.5765 (0.5823) data time 0.0006 (0.0022) model time 0.5759 (0.5801) loss 9.4761 (8.2426) grad_norm 1.7167 (inf) loss_scale 4096.0000 (6861.4012) mem 22339MB +[2024-07-24 23:34:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][520/625] eta 0:01:01 lr 0.001003 wd 0.0500 time 0.5759 (0.5822) data time 0.0006 (0.0022) model time 0.5753 (0.5800) loss 8.6189 (8.2379) grad_norm 1.9902 (inf) loss_scale 4096.0000 (6808.3225) mem 22339MB +[2024-07-24 23:35:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][530/625] eta 0:00:55 lr 0.001003 wd 0.0500 time 0.5757 (0.5821) data time 0.0006 (0.0022) model time 0.5752 (0.5799) loss 8.1997 (8.2299) grad_norm 1.6422 (inf) loss_scale 4096.0000 (6757.2429) mem 22339MB +[2024-07-24 23:35:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][540/625] eta 0:00:49 lr 0.001002 wd 0.0500 time 0.5765 (0.5820) data time 0.0006 (0.0021) model time 0.5760 (0.5798) loss 6.7169 (8.2302) grad_norm 1.8704 (inf) loss_scale 4096.0000 (6708.0518) mem 22339MB +[2024-07-24 23:35:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][550/625] eta 0:00:43 lr 0.001002 wd 0.0500 time 0.5745 (0.5819) data time 0.0008 (0.0021) model time 0.5737 (0.5797) loss 9.7070 (8.2351) grad_norm 1.8577 (inf) loss_scale 4096.0000 (6660.6461) mem 22339MB +[2024-07-24 23:35:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][560/625] eta 0:00:37 lr 0.001002 wd 0.0500 time 0.5762 (0.5820) data time 0.0006 (0.0021) model time 0.5756 (0.5799) loss 8.4200 (8.2315) grad_norm 1.9473 (inf) loss_scale 4096.0000 (6614.9305) mem 22339MB +[2024-07-24 23:35:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][570/625] eta 0:00:32 lr 0.001002 wd 0.0500 time 0.5766 (0.5819) data time 0.0007 (0.0021) model time 0.5758 (0.5798) loss 7.6890 (8.2255) grad_norm 2.4043 (inf) loss_scale 4096.0000 (6570.8161) mem 22339MB +[2024-07-24 23:35:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][580/625] eta 0:00:26 lr 0.001002 wd 0.0500 time 0.5881 (0.5818) data time 0.0007 (0.0021) model time 0.5874 (0.5797) loss 7.1498 (8.2204) grad_norm 2.1159 (inf) loss_scale 4096.0000 (6528.2203) mem 22339MB +[2024-07-24 23:35:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][590/625] eta 0:00:20 lr 0.001002 wd 0.0500 time 0.5749 (0.5817) data time 0.0008 (0.0020) model time 0.5741 (0.5796) loss 9.6195 (8.2206) grad_norm 2.4789 (inf) loss_scale 4096.0000 (6487.0660) mem 22339MB +[2024-07-24 23:35:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][600/625] eta 0:00:14 lr 0.001002 wd 0.0500 time 0.5772 (0.5816) data time 0.0006 (0.0020) model time 0.5766 (0.5795) loss 7.9978 (8.2246) grad_norm 1.6480 (inf) loss_scale 4096.0000 (6447.2812) mem 22339MB +[2024-07-24 23:35:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][610/625] eta 0:00:08 lr 0.001002 wd 0.0500 time 0.5747 (0.5815) data time 0.0006 (0.0020) model time 0.5741 (0.5794) loss 8.1978 (8.2203) grad_norm 1.6697 (inf) loss_scale 4096.0000 (6408.7987) mem 22339MB +[2024-07-24 23:35:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [94/300][620/625] eta 0:00:02 lr 0.001002 wd 0.0500 time 0.5828 (0.5814) data time 0.0004 (0.0020) model time 0.5824 (0.5794) loss 8.8042 (8.2210) grad_norm 1.9612 (inf) loss_scale 4096.0000 (6371.5556) mem 22339MB +[2024-07-24 23:35:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 94 training takes 0:06:03 +[2024-07-24 23:35:55 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-24 23:35:57 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-24 23:35:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.511 (0.511) Loss 0.5254 (0.5254) Acc@1 88.818 (88.818) Acc@5 98.486 (98.486) Mem 22339MB +[2024-07-24 23:35:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.161) Loss 0.8711 (0.6711) Acc@1 78.662 (84.908) Acc@5 95.410 (97.408) Mem 22339MB +[2024-07-24 23:36:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.144) Loss 0.9761 (0.8001) Acc@1 74.805 (81.450) Acc@5 94.531 (96.001) Mem 22339MB +[2024-07-24 23:36:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.200 Acc@5 95.997 +[2024-07-24 23:36:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.2% +[2024-07-24 23:36:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 1.062 (1.062) Loss 0.5435 (0.5435) Acc@1 89.111 (89.111) Acc@5 98.584 (98.584) Mem 22339MB +[2024-07-24 23:36:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.211) Loss 0.8613 (0.6754) Acc@1 79.834 (85.720) Acc@5 95.898 (97.590) Mem 22339MB +[2024-07-24 23:36:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.170) Loss 0.9697 (0.7892) Acc@1 75.977 (82.447) Acc@5 94.971 (96.350) Mem 22339MB +[2024-07-24 23:36:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.172 Acc@5 96.373 +[2024-07-24 23:36:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.2% +[2024-07-24 23:36:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.17% +[2024-07-24 23:36:04 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-24 23:36:06 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-24 23:36:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][0/625] eta 0:09:05 lr 0.001002 wd 0.0500 time 0.8729 (0.8729) data time 0.3442 (0.3442) model time 0.0000 (0.0000) loss 6.8349 (6.8349) grad_norm 1.8774 (1.8774) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:36:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][10/625] eta 0:06:18 lr 0.001002 wd 0.0500 time 0.5608 (0.6150) data time 0.0008 (0.0320) model time 0.0000 (0.0000) loss 9.0203 (8.2806) grad_norm 2.2613 (1.8593) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:36:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][20/625] eta 0:06:21 lr 0.001002 wd 0.0500 time 0.5737 (0.6302) data time 0.0006 (0.0171) model time 0.0000 (0.0000) loss 6.7898 (7.8453) grad_norm 2.3489 (2.0045) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:36:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][30/625] eta 0:06:12 lr 0.001002 wd 0.0500 time 0.5754 (0.6268) data time 0.0006 (0.0118) model time 0.0000 (0.0000) loss 8.2361 (7.9995) grad_norm 1.8837 (1.9889) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:36:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][40/625] eta 0:05:59 lr 0.001001 wd 0.0500 time 0.5730 (0.6141) data time 0.0006 (0.0092) model time 0.0000 (0.0000) loss 6.9211 (7.9862) grad_norm 2.4133 (1.9744) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:36:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][50/625] eta 0:05:48 lr 0.001001 wd 0.0500 time 0.5731 (0.6065) data time 0.0007 (0.0075) model time 0.0000 (0.0000) loss 8.0879 (8.0101) grad_norm 1.5080 (1.9751) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:36:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][60/625] eta 0:05:39 lr 0.001001 wd 0.0500 time 0.5778 (0.6012) data time 0.0006 (0.0064) model time 0.5772 (0.5735) loss 9.8594 (8.0903) grad_norm 1.7264 (1.9666) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:36:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][70/625] eta 0:05:31 lr 0.001001 wd 0.0500 time 0.5728 (0.5975) data time 0.0006 (0.0056) model time 0.5721 (0.5738) loss 7.3301 (8.0626) grad_norm 1.8131 (2.0453) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:36:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][80/625] eta 0:05:24 lr 0.001001 wd 0.0500 time 0.5722 (0.5946) data time 0.0006 (0.0050) model time 0.5715 (0.5736) loss 8.9506 (8.1281) grad_norm 2.7697 (2.0872) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:37:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][90/625] eta 0:05:17 lr 0.001001 wd 0.0500 time 0.5743 (0.5931) data time 0.0008 (0.0046) model time 0.5735 (0.5751) loss 7.0864 (8.0923) grad_norm 1.5571 (2.0843) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:37:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][100/625] eta 0:05:10 lr 0.001001 wd 0.0500 time 0.5760 (0.5914) data time 0.0006 (0.0042) model time 0.5754 (0.5752) loss 9.9019 (8.1260) grad_norm 1.7298 (2.0565) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:37:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][110/625] eta 0:05:03 lr 0.001001 wd 0.0500 time 0.5746 (0.5899) data time 0.0006 (0.0039) model time 0.5740 (0.5750) loss 8.7532 (8.1837) grad_norm 2.6188 (2.0794) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:37:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][120/625] eta 0:04:57 lr 0.001001 wd 0.0500 time 0.5699 (0.5887) data time 0.0006 (0.0036) model time 0.5693 (0.5750) loss 6.5269 (8.2052) grad_norm 2.0027 (2.0605) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:37:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][130/625] eta 0:04:51 lr 0.001001 wd 0.0500 time 0.5724 (0.5880) data time 0.0009 (0.0034) model time 0.5716 (0.5753) loss 7.2766 (8.2114) grad_norm 2.4685 (2.0506) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:37:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][140/625] eta 0:04:44 lr 0.001001 wd 0.0500 time 0.5724 (0.5870) data time 0.0006 (0.0032) model time 0.5717 (0.5752) loss 7.5036 (8.1984) grad_norm 1.7958 (2.0404) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:37:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][150/625] eta 0:04:38 lr 0.001001 wd 0.0500 time 0.5730 (0.5863) data time 0.0008 (0.0031) model time 0.5722 (0.5752) loss 7.4487 (8.1716) grad_norm 2.1971 (2.0350) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:37:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][160/625] eta 0:04:32 lr 0.001001 wd 0.0500 time 0.5767 (0.5856) data time 0.0006 (0.0029) model time 0.5760 (0.5750) loss 8.0401 (8.1478) grad_norm 1.4034 (2.0343) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:37:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][170/625] eta 0:04:26 lr 0.001000 wd 0.0500 time 0.5711 (0.5849) data time 0.0006 (0.0028) model time 0.5704 (0.5750) loss 8.5980 (8.1682) grad_norm 1.7133 (2.0343) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:37:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][180/625] eta 0:04:20 lr 0.001000 wd 0.0500 time 0.5763 (0.5845) data time 0.0008 (0.0027) model time 0.5755 (0.5750) loss 9.0934 (8.1749) grad_norm 1.3554 (2.0226) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:37:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][190/625] eta 0:04:14 lr 0.001000 wd 0.0500 time 0.5721 (0.5840) data time 0.0006 (0.0026) model time 0.5715 (0.5750) loss 7.5119 (8.1791) grad_norm 1.8353 (2.0155) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:38:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][200/625] eta 0:04:08 lr 0.001000 wd 0.0500 time 0.5704 (0.5836) data time 0.0008 (0.0025) model time 0.5696 (0.5750) loss 7.2544 (8.1712) grad_norm 2.0605 (2.0077) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:38:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][210/625] eta 0:04:02 lr 0.001000 wd 0.0500 time 0.5705 (0.5833) data time 0.0007 (0.0024) model time 0.5698 (0.5750) loss 7.3400 (8.1475) grad_norm 2.4609 (2.0073) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:38:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][220/625] eta 0:03:56 lr 0.001000 wd 0.0500 time 0.5758 (0.5829) data time 0.0007 (0.0024) model time 0.5751 (0.5750) loss 7.3183 (8.1458) grad_norm 1.8090 (2.0350) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:38:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][230/625] eta 0:03:50 lr 0.001000 wd 0.0500 time 0.5695 (0.5825) data time 0.0008 (0.0023) model time 0.5688 (0.5749) loss 7.2429 (8.1332) grad_norm 1.4655 (2.0309) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:38:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][240/625] eta 0:03:45 lr 0.001000 wd 0.0500 time 0.7102 (0.5857) data time 0.0006 (0.0022) model time 0.7096 (0.5792) loss 9.1953 (8.1364) grad_norm 1.4044 (2.0304) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:38:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][250/625] eta 0:03:39 lr 0.001000 wd 0.0500 time 0.5821 (0.5866) data time 0.0011 (0.0022) model time 0.5809 (0.5807) loss 9.7985 (8.1405) grad_norm 1.8030 (2.0230) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:38:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][260/625] eta 0:03:33 lr 0.001000 wd 0.0500 time 0.5763 (0.5863) data time 0.0008 (0.0021) model time 0.5755 (0.5805) loss 9.2544 (8.1524) grad_norm 1.8207 (2.0302) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:38:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][270/625] eta 0:03:27 lr 0.001000 wd 0.0500 time 0.5704 (0.5858) data time 0.0008 (0.0021) model time 0.5696 (0.5802) loss 8.4310 (8.1577) grad_norm 1.6070 (2.0389) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:38:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][280/625] eta 0:03:21 lr 0.001000 wd 0.0500 time 0.5712 (0.5855) data time 0.0008 (0.0020) model time 0.5705 (0.5800) loss 8.8817 (8.1556) grad_norm 1.8798 (2.0374) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:38:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][290/625] eta 0:03:16 lr 0.000999 wd 0.0500 time 0.5732 (0.5852) data time 0.0008 (0.0020) model time 0.5724 (0.5798) loss 9.7191 (8.1648) grad_norm 2.1660 (2.0312) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:39:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][300/625] eta 0:03:10 lr 0.000999 wd 0.0500 time 0.5730 (0.5848) data time 0.0006 (0.0020) model time 0.5724 (0.5796) loss 7.2317 (8.1692) grad_norm 2.4914 (2.0314) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:39:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][310/625] eta 0:03:04 lr 0.000999 wd 0.0500 time 0.5705 (0.5848) data time 0.0007 (0.0019) model time 0.5698 (0.5797) loss 7.5284 (8.1580) grad_norm 1.8053 (2.0290) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:39:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][320/625] eta 0:02:58 lr 0.000999 wd 0.0500 time 0.5689 (0.5845) data time 0.0008 (0.0019) model time 0.5680 (0.5795) loss 7.8385 (8.1549) grad_norm 2.4810 (2.0378) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:39:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][330/625] eta 0:02:52 lr 0.000999 wd 0.0500 time 0.5710 (0.5843) data time 0.0006 (0.0019) model time 0.5704 (0.5794) loss 9.0577 (8.1544) grad_norm 1.8476 (2.0313) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:39:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][340/625] eta 0:02:46 lr 0.000999 wd 0.0500 time 0.5723 (0.5840) data time 0.0008 (0.0018) model time 0.5715 (0.5793) loss 7.0164 (8.1441) grad_norm 2.3149 (2.0407) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:39:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][350/625] eta 0:02:40 lr 0.000999 wd 0.0500 time 0.5722 (0.5838) data time 0.0008 (0.0018) model time 0.5714 (0.5791) loss 9.1298 (8.1416) grad_norm 2.8234 (2.0601) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:39:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][360/625] eta 0:02:34 lr 0.000999 wd 0.0500 time 0.5756 (0.5836) data time 0.0008 (0.0018) model time 0.5748 (0.5790) loss 8.5336 (8.1373) grad_norm 2.7406 (2.0615) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:39:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][370/625] eta 0:02:28 lr 0.000999 wd 0.0500 time 0.5845 (0.5834) data time 0.0008 (0.0017) model time 0.5837 (0.5789) loss 7.9744 (8.1343) grad_norm 3.7711 (2.0616) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:39:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][380/625] eta 0:02:22 lr 0.000999 wd 0.0500 time 0.5741 (0.5832) data time 0.0006 (0.0017) model time 0.5735 (0.5788) loss 7.9108 (8.1359) grad_norm 1.7230 (2.0721) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:39:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][390/625] eta 0:02:17 lr 0.000999 wd 0.0500 time 0.5696 (0.5830) data time 0.0006 (0.0017) model time 0.5690 (0.5786) loss 8.6017 (8.1340) grad_norm 3.4273 (2.0756) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:39:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][400/625] eta 0:02:11 lr 0.000999 wd 0.0500 time 0.5632 (0.5828) data time 0.0006 (0.0017) model time 0.5625 (0.5786) loss 7.0351 (8.1294) grad_norm 1.8864 (2.0762) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:40:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][410/625] eta 0:02:05 lr 0.000999 wd 0.0500 time 0.5726 (0.5827) data time 0.0006 (0.0017) model time 0.5719 (0.5785) loss 7.3978 (8.1355) grad_norm 2.0417 (2.0754) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:40:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][420/625] eta 0:01:59 lr 0.000998 wd 0.0500 time 0.5678 (0.5825) data time 0.0008 (0.0016) model time 0.5670 (0.5784) loss 8.0369 (8.1267) grad_norm 1.8313 (2.0724) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:40:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][430/625] eta 0:01:53 lr 0.000998 wd 0.0500 time 0.5707 (0.5824) data time 0.0008 (0.0016) model time 0.5698 (0.5784) loss 7.9750 (8.1263) grad_norm 1.6556 (2.0723) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:40:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][440/625] eta 0:01:47 lr 0.000998 wd 0.0500 time 0.5612 (0.5824) data time 0.0007 (0.0016) model time 0.5606 (0.5784) loss 6.9505 (8.1371) grad_norm 1.7808 (2.0699) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:40:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][450/625] eta 0:01:41 lr 0.000998 wd 0.0500 time 0.7105 (0.5825) data time 0.0007 (0.0016) model time 0.7097 (0.5786) loss 8.4326 (8.1419) grad_norm 2.3662 (2.0661) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:40:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][460/625] eta 0:01:36 lr 0.000998 wd 0.0500 time 0.7725 (0.5842) data time 0.0007 (0.0016) model time 0.7718 (0.5806) loss 9.2371 (8.1448) grad_norm 1.7550 (2.0688) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:40:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][470/625] eta 0:01:30 lr 0.000998 wd 0.0500 time 0.5682 (0.5849) data time 0.0007 (0.0015) model time 0.5675 (0.5815) loss 8.7304 (8.1491) grad_norm 1.8198 (2.0675) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:40:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][480/625] eta 0:01:24 lr 0.000998 wd 0.0500 time 0.5740 (0.5847) data time 0.0006 (0.0015) model time 0.5734 (0.5812) loss 8.1354 (8.1438) grad_norm 1.6775 (2.0677) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:40:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][490/625] eta 0:01:18 lr 0.000998 wd 0.0500 time 0.5744 (0.5844) data time 0.0008 (0.0015) model time 0.5736 (0.5811) loss 8.9215 (8.1541) grad_norm 2.1376 (2.0688) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:40:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][500/625] eta 0:01:13 lr 0.000998 wd 0.0500 time 0.5745 (0.5842) data time 0.0006 (0.0015) model time 0.5740 (0.5809) loss 9.0622 (8.1574) grad_norm 2.0660 (2.0653) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:41:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][510/625] eta 0:01:07 lr 0.000998 wd 0.0500 time 0.5732 (0.5841) data time 0.0008 (0.0015) model time 0.5723 (0.5808) loss 7.5821 (8.1603) grad_norm 1.8764 (2.0623) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:41:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][520/625] eta 0:01:01 lr 0.000998 wd 0.0500 time 0.5735 (0.5839) data time 0.0007 (0.0015) model time 0.5728 (0.5806) loss 8.3336 (8.1622) grad_norm 1.6454 (2.0598) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:41:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][530/625] eta 0:00:55 lr 0.000998 wd 0.0500 time 0.5749 (0.5840) data time 0.0007 (0.0015) model time 0.5741 (0.5808) loss 9.5804 (8.1660) grad_norm 1.6252 (2.0543) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:41:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][540/625] eta 0:00:49 lr 0.000997 wd 0.0500 time 0.5713 (0.5839) data time 0.0006 (0.0015) model time 0.5707 (0.5807) loss 9.0800 (8.1710) grad_norm 1.5370 (2.0515) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:41:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][550/625] eta 0:00:43 lr 0.000997 wd 0.0500 time 0.5703 (0.5837) data time 0.0006 (0.0014) model time 0.5697 (0.5805) loss 8.1229 (8.1732) grad_norm 2.3036 (2.0545) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:41:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][560/625] eta 0:00:37 lr 0.000997 wd 0.0500 time 0.5717 (0.5835) data time 0.0006 (0.0014) model time 0.5710 (0.5804) loss 8.0285 (8.1713) grad_norm 1.6765 (2.0642) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:41:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][570/625] eta 0:00:32 lr 0.000997 wd 0.0500 time 0.5708 (0.5834) data time 0.0006 (0.0014) model time 0.5702 (0.5803) loss 6.7260 (8.1724) grad_norm 2.0894 (2.0749) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:41:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][580/625] eta 0:00:26 lr 0.000997 wd 0.0500 time 0.5716 (0.5833) data time 0.0006 (0.0014) model time 0.5710 (0.5802) loss 7.2510 (8.1721) grad_norm 2.6987 (2.0764) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:41:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][590/625] eta 0:00:20 lr 0.000997 wd 0.0500 time 0.5703 (0.5832) data time 0.0007 (0.0014) model time 0.5695 (0.5801) loss 9.1115 (8.1743) grad_norm 2.0603 (2.0779) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:41:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][600/625] eta 0:00:14 lr 0.000997 wd 0.0500 time 0.5724 (0.5830) data time 0.0008 (0.0014) model time 0.5716 (0.5800) loss 8.6818 (8.1747) grad_norm 1.6908 (2.0745) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:42:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][610/625] eta 0:00:08 lr 0.000997 wd 0.0500 time 0.5680 (0.5829) data time 0.0006 (0.0014) model time 0.5674 (0.5799) loss 7.2006 (8.1737) grad_norm 1.6607 (2.0798) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:42:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [95/300][620/625] eta 0:00:02 lr 0.000997 wd 0.0500 time 0.5718 (0.5827) data time 0.0006 (0.0014) model time 0.5713 (0.5798) loss 7.3304 (8.1838) grad_norm 2.4572 (2.0817) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:42:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 95 training takes 0:06:04 +[2024-07-24 23:42:10 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-24 23:42:11 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-24 23:42:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.448 (0.448) Loss 0.5435 (0.5435) Acc@1 88.916 (88.916) Acc@5 98.242 (98.242) Mem 22339MB +[2024-07-24 23:42:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.155) Loss 0.8979 (0.6849) Acc@1 79.004 (84.885) Acc@5 95.215 (97.297) Mem 22339MB +[2024-07-24 23:42:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.141) Loss 1.0244 (0.8081) Acc@1 75.537 (81.657) Acc@5 93.750 (95.947) Mem 22339MB +[2024-07-24 23:42:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.376 Acc@5 95.947 +[2024-07-24 23:42:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.4% +[2024-07-24 23:42:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 81.38% +[2024-07-24 23:42:15 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-24 23:42:16 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-24 23:42:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.465 (0.465) Loss 0.5400 (0.5400) Acc@1 89.209 (89.209) Acc@5 98.584 (98.584) Mem 22339MB +[2024-07-24 23:42:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.156) Loss 0.8579 (0.6727) Acc@1 79.834 (85.707) Acc@5 95.850 (97.603) Mem 22339MB +[2024-07-24 23:42:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9653 (0.7864) Acc@1 76.270 (82.466) Acc@5 94.971 (96.368) Mem 22339MB +[2024-07-24 23:42:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.186 Acc@5 96.389 +[2024-07-24 23:42:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.2% +[2024-07-24 23:42:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.19% +[2024-07-24 23:42:19 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-24 23:42:21 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-24 23:42:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][0/625] eta 0:09:50 lr 0.000997 wd 0.0500 time 0.9442 (0.9442) data time 0.4270 (0.4270) model time 0.0000 (0.0000) loss 7.1198 (7.1198) grad_norm 2.1870 (2.1870) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:42:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][10/625] eta 0:06:13 lr 0.000997 wd 0.0500 time 0.5722 (0.6078) data time 0.0006 (0.0396) model time 0.0000 (0.0000) loss 9.2302 (8.2558) grad_norm 1.7905 (1.8786) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:42:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][20/625] eta 0:05:58 lr 0.000997 wd 0.0500 time 0.5738 (0.5918) data time 0.0006 (0.0211) model time 0.0000 (0.0000) loss 8.4667 (8.2229) grad_norm 2.7532 (1.9736) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:42:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][30/625] eta 0:05:50 lr 0.000997 wd 0.0500 time 0.5745 (0.5896) data time 0.0008 (0.0145) model time 0.0000 (0.0000) loss 7.3108 (8.1155) grad_norm 2.4582 (1.9441) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:42:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][40/625] eta 0:05:42 lr 0.000996 wd 0.0500 time 0.5733 (0.5858) data time 0.0006 (0.0112) model time 0.0000 (0.0000) loss 6.9444 (8.2331) grad_norm 1.6688 (1.9320) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:42:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][50/625] eta 0:05:38 lr 0.000996 wd 0.0500 time 0.5689 (0.5880) data time 0.0007 (0.0091) model time 0.0000 (0.0000) loss 7.9408 (8.2094) grad_norm 1.7083 (1.9067) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:42:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][60/625] eta 0:05:38 lr 0.000996 wd 0.0500 time 0.7452 (0.5991) data time 0.0008 (0.0078) model time 0.7443 (0.6553) loss 8.3346 (8.1443) grad_norm 1.6805 (1.8704) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:43:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][70/625] eta 0:05:31 lr 0.000996 wd 0.0500 time 0.5678 (0.5973) data time 0.0008 (0.0068) model time 0.5669 (0.6204) loss 8.8246 (8.0888) grad_norm 1.8563 (1.9234) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:43:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][80/625] eta 0:05:24 lr 0.000996 wd 0.0500 time 0.5731 (0.5946) data time 0.0007 (0.0061) model time 0.5725 (0.6048) loss 7.5305 (8.1286) grad_norm 1.7875 (1.9162) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:43:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][90/625] eta 0:05:16 lr 0.000996 wd 0.0500 time 0.5730 (0.5924) data time 0.0006 (0.0056) model time 0.5724 (0.5971) loss 8.5730 (8.0800) grad_norm 1.6068 (1.9080) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:43:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][100/625] eta 0:05:10 lr 0.000996 wd 0.0500 time 0.5748 (0.5907) data time 0.0006 (0.0051) model time 0.5741 (0.5926) loss 9.2917 (8.1106) grad_norm 2.3038 (1.9742) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:43:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][110/625] eta 0:05:03 lr 0.000996 wd 0.0500 time 0.5740 (0.5893) data time 0.0007 (0.0047) model time 0.5733 (0.5895) loss 7.7268 (8.0881) grad_norm 2.4438 (1.9750) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:43:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][120/625] eta 0:04:56 lr 0.000996 wd 0.0500 time 0.5704 (0.5881) data time 0.0008 (0.0044) model time 0.5696 (0.5872) loss 8.9122 (8.0987) grad_norm 2.3440 (1.9864) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:43:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][130/625] eta 0:04:50 lr 0.000996 wd 0.0500 time 0.5753 (0.5871) data time 0.0006 (0.0041) model time 0.5747 (0.5856) loss 9.0535 (8.0691) grad_norm 1.7711 (1.9756) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:43:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][140/625] eta 0:04:44 lr 0.000996 wd 0.0500 time 0.5726 (0.5863) data time 0.0006 (0.0039) model time 0.5720 (0.5845) loss 8.4794 (8.0578) grad_norm 1.7529 (1.9705) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:43:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][150/625] eta 0:04:38 lr 0.000996 wd 0.0500 time 0.5754 (0.5856) data time 0.0006 (0.0037) model time 0.5748 (0.5835) loss 7.6128 (8.0483) grad_norm 1.5356 (1.9745) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:43:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][160/625] eta 0:04:32 lr 0.000996 wd 0.0500 time 0.5747 (0.5850) data time 0.0006 (0.0035) model time 0.5741 (0.5827) loss 8.7226 (8.0551) grad_norm 1.9758 (1.9913) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:44:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][170/625] eta 0:04:25 lr 0.000995 wd 0.0500 time 0.5744 (0.5844) data time 0.0006 (0.0033) model time 0.5738 (0.5820) loss 7.6778 (8.0530) grad_norm 2.0738 (1.9928) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:44:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][180/625] eta 0:04:19 lr 0.000995 wd 0.0500 time 0.5691 (0.5839) data time 0.0008 (0.0032) model time 0.5683 (0.5815) loss 7.0360 (8.0198) grad_norm 1.6963 (1.9910) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:44:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][190/625] eta 0:04:13 lr 0.000995 wd 0.0500 time 0.5762 (0.5835) data time 0.0006 (0.0031) model time 0.5756 (0.5810) loss 8.1132 (8.0041) grad_norm 1.9953 (2.0096) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:44:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][200/625] eta 0:04:07 lr 0.000995 wd 0.0500 time 0.5741 (0.5830) data time 0.0006 (0.0030) model time 0.5735 (0.5805) loss 8.0717 (7.9941) grad_norm 1.6754 (2.0074) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:44:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][210/625] eta 0:04:01 lr 0.000995 wd 0.0500 time 0.5731 (0.5827) data time 0.0008 (0.0029) model time 0.5722 (0.5802) loss 7.7520 (8.0050) grad_norm 1.6860 (2.0006) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:44:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][220/625] eta 0:03:55 lr 0.000995 wd 0.0500 time 0.5754 (0.5824) data time 0.0006 (0.0028) model time 0.5748 (0.5799) loss 9.4033 (8.0301) grad_norm 1.7691 (2.0028) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:44:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][230/625] eta 0:03:49 lr 0.000995 wd 0.0500 time 0.5756 (0.5821) data time 0.0006 (0.0027) model time 0.5750 (0.5796) loss 6.8936 (8.0266) grad_norm 1.9392 (1.9980) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:44:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][240/625] eta 0:03:44 lr 0.000995 wd 0.0500 time 0.7691 (0.5826) data time 0.0008 (0.0026) model time 0.7683 (0.5804) loss 8.8917 (8.0195) grad_norm 3.3684 (2.0107) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:44:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][250/625] eta 0:03:38 lr 0.000995 wd 0.0500 time 0.5705 (0.5821) data time 0.0006 (0.0025) model time 0.5699 (0.5797) loss 8.6633 (8.0142) grad_norm 1.5941 (2.0187) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:44:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][260/625] eta 0:03:32 lr 0.000995 wd 0.0500 time 0.5764 (0.5818) data time 0.0007 (0.0025) model time 0.5757 (0.5795) loss 8.6166 (8.0106) grad_norm 1.7014 (2.0208) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:44:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][270/625] eta 0:03:26 lr 0.000995 wd 0.0500 time 0.5620 (0.5828) data time 0.0009 (0.0024) model time 0.5611 (0.5807) loss 8.1746 (8.0178) grad_norm 1.5919 (2.0226) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:45:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][280/625] eta 0:03:22 lr 0.000995 wd 0.0500 time 0.7451 (0.5860) data time 0.0006 (0.0023) model time 0.7445 (0.5847) loss 7.6313 (8.0164) grad_norm 2.4188 (2.0293) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:45:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][290/625] eta 0:03:16 lr 0.000994 wd 0.0500 time 0.5735 (0.5861) data time 0.0008 (0.0023) model time 0.5726 (0.5849) loss 8.3494 (8.0322) grad_norm 1.6297 (2.0310) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:45:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][300/625] eta 0:03:10 lr 0.000994 wd 0.0500 time 0.5741 (0.5858) data time 0.0008 (0.0022) model time 0.5733 (0.5845) loss 8.0742 (8.0585) grad_norm 1.7435 (2.0248) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:45:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][310/625] eta 0:03:04 lr 0.000994 wd 0.0500 time 0.5754 (0.5854) data time 0.0008 (0.0022) model time 0.5746 (0.5841) loss 8.8031 (8.0709) grad_norm 1.7305 (2.0331) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:45:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][320/625] eta 0:02:58 lr 0.000994 wd 0.0500 time 0.5709 (0.5850) data time 0.0008 (0.0022) model time 0.5701 (0.5836) loss 7.5290 (8.0525) grad_norm 1.7649 (2.0250) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:45:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][330/625] eta 0:02:52 lr 0.000994 wd 0.0500 time 0.5733 (0.5847) data time 0.0008 (0.0021) model time 0.5725 (0.5833) loss 9.4574 (8.0524) grad_norm 2.9755 (2.0249) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:45:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][340/625] eta 0:02:46 lr 0.000994 wd 0.0500 time 0.5739 (0.5844) data time 0.0006 (0.0021) model time 0.5733 (0.5830) loss 8.4248 (8.0442) grad_norm 2.8250 (2.0282) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:45:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][350/625] eta 0:02:40 lr 0.000994 wd 0.0500 time 0.5737 (0.5842) data time 0.0006 (0.0020) model time 0.5731 (0.5827) loss 8.3287 (8.0490) grad_norm 1.5275 (2.0265) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:45:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][360/625] eta 0:02:34 lr 0.000994 wd 0.0500 time 0.5738 (0.5839) data time 0.0008 (0.0020) model time 0.5731 (0.5824) loss 9.5937 (8.0459) grad_norm 1.7518 (2.0245) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:45:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][370/625] eta 0:02:28 lr 0.000994 wd 0.0500 time 0.5727 (0.5837) data time 0.0006 (0.0020) model time 0.5721 (0.5822) loss 7.2627 (8.0412) grad_norm 1.6615 (2.0223) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:46:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][380/625] eta 0:02:22 lr 0.000994 wd 0.0500 time 0.5712 (0.5834) data time 0.0006 (0.0019) model time 0.5706 (0.5819) loss 7.7405 (8.0458) grad_norm 1.9956 (2.0274) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:46:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][390/625] eta 0:02:17 lr 0.000994 wd 0.0500 time 0.5733 (0.5832) data time 0.0008 (0.0019) model time 0.5725 (0.5817) loss 7.2632 (8.0407) grad_norm 2.4152 (2.0398) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:46:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][400/625] eta 0:02:11 lr 0.000994 wd 0.0500 time 0.5750 (0.5830) data time 0.0006 (0.0019) model time 0.5743 (0.5814) loss 8.9576 (8.0382) grad_norm 2.0287 (2.0444) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:46:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][410/625] eta 0:02:05 lr 0.000994 wd 0.0500 time 0.5742 (0.5828) data time 0.0009 (0.0019) model time 0.5733 (0.5812) loss 6.2941 (8.0340) grad_norm 1.9420 (2.0463) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:46:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][420/625] eta 0:01:59 lr 0.000993 wd 0.0500 time 0.5743 (0.5826) data time 0.0006 (0.0018) model time 0.5736 (0.5810) loss 8.7757 (8.0397) grad_norm 1.7186 (2.0432) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:46:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][430/625] eta 0:01:53 lr 0.000993 wd 0.0500 time 0.5735 (0.5824) data time 0.0010 (0.0018) model time 0.5726 (0.5809) loss 9.0536 (8.0464) grad_norm 1.7745 (2.0435) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:46:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][440/625] eta 0:01:47 lr 0.000993 wd 0.0500 time 0.5779 (0.5822) data time 0.0007 (0.0018) model time 0.5772 (0.5807) loss 9.0232 (8.0456) grad_norm 2.1180 (2.0443) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:46:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][450/625] eta 0:01:41 lr 0.000993 wd 0.0500 time 0.5734 (0.5821) data time 0.0006 (0.0018) model time 0.5729 (0.5805) loss 8.1636 (8.0431) grad_norm 2.2061 (2.0460) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:46:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][460/625] eta 0:01:36 lr 0.000993 wd 0.0500 time 0.5735 (0.5820) data time 0.0009 (0.0017) model time 0.5726 (0.5804) loss 9.9016 (8.0523) grad_norm 2.3070 (2.0438) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:46:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][470/625] eta 0:01:30 lr 0.000993 wd 0.0500 time 0.5685 (0.5819) data time 0.0006 (0.0017) model time 0.5679 (0.5804) loss 8.6591 (8.0541) grad_norm 1.7134 (2.0391) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:47:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][480/625] eta 0:01:24 lr 0.000993 wd 0.0500 time 0.5741 (0.5818) data time 0.0006 (0.0017) model time 0.5735 (0.5802) loss 7.7827 (8.0451) grad_norm 1.7394 (2.0355) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:47:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][490/625] eta 0:01:18 lr 0.000993 wd 0.0500 time 0.7339 (0.5822) data time 0.0008 (0.0017) model time 0.7331 (0.5808) loss 8.3783 (8.0417) grad_norm 1.5981 (2.0331) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:47:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][500/625] eta 0:01:12 lr 0.000993 wd 0.0500 time 0.5653 (0.5835) data time 0.0006 (0.0017) model time 0.5647 (0.5822) loss 9.3106 (8.0461) grad_norm 2.3085 (2.0287) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:47:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][510/625] eta 0:01:07 lr 0.000993 wd 0.0500 time 0.5709 (0.5836) data time 0.0008 (0.0016) model time 0.5701 (0.5823) loss 7.4951 (8.0464) grad_norm 1.7911 (2.0249) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:47:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][520/625] eta 0:01:01 lr 0.000993 wd 0.0500 time 0.5737 (0.5834) data time 0.0006 (0.0016) model time 0.5731 (0.5821) loss 7.0164 (8.0387) grad_norm 2.5760 (2.0280) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:47:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][530/625] eta 0:00:55 lr 0.000993 wd 0.0500 time 0.5711 (0.5833) data time 0.0008 (0.0016) model time 0.5704 (0.5819) loss 7.8813 (8.0485) grad_norm 1.8269 (2.0292) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:47:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][540/625] eta 0:00:49 lr 0.000992 wd 0.0500 time 0.5740 (0.5831) data time 0.0008 (0.0016) model time 0.5732 (0.5818) loss 9.3380 (8.0488) grad_norm 2.1122 (2.0279) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:47:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][550/625] eta 0:00:43 lr 0.000992 wd 0.0500 time 0.5736 (0.5830) data time 0.0006 (0.0016) model time 0.5731 (0.5816) loss 6.5675 (8.0475) grad_norm 2.1608 (2.0275) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:47:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][560/625] eta 0:00:37 lr 0.000992 wd 0.0500 time 0.5716 (0.5828) data time 0.0008 (0.0016) model time 0.5708 (0.5815) loss 9.9710 (8.0514) grad_norm 1.7580 (2.0234) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:47:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][570/625] eta 0:00:32 lr 0.000992 wd 0.0500 time 0.5750 (0.5827) data time 0.0006 (0.0016) model time 0.5744 (0.5813) loss 8.3289 (8.0587) grad_norm 1.8405 (2.0225) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:47:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][580/625] eta 0:00:26 lr 0.000992 wd 0.0500 time 0.5742 (0.5825) data time 0.0006 (0.0015) model time 0.5736 (0.5812) loss 8.2974 (8.0583) grad_norm 2.2684 (2.0238) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:48:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][590/625] eta 0:00:20 lr 0.000992 wd 0.0500 time 0.5707 (0.5824) data time 0.0008 (0.0015) model time 0.5699 (0.5811) loss 8.5317 (8.0526) grad_norm 2.7482 (2.0217) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:48:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][600/625] eta 0:00:14 lr 0.000992 wd 0.0500 time 0.5718 (0.5823) data time 0.0008 (0.0015) model time 0.5711 (0.5810) loss 8.6257 (8.0527) grad_norm 2.4918 (2.0205) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:48:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][610/625] eta 0:00:08 lr 0.000992 wd 0.0500 time 0.5728 (0.5822) data time 0.0004 (0.0015) model time 0.5724 (0.5808) loss 6.2229 (8.0550) grad_norm 2.1380 (2.0272) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:48:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [96/300][620/625] eta 0:00:02 lr 0.000992 wd 0.0500 time 0.5738 (0.5820) data time 0.0005 (0.0015) model time 0.5733 (0.5807) loss 8.8928 (8.0602) grad_norm 1.6542 (2.0364) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:48:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 96 training takes 0:06:03 +[2024-07-24 23:48:25 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-24 23:48:26 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-24 23:48:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.716 (0.716) Loss 0.5508 (0.5508) Acc@1 88.770 (88.770) Acc@5 98.438 (98.438) Mem 22339MB +[2024-07-24 23:48:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.179) Loss 0.9126 (0.6937) Acc@1 76.953 (84.819) Acc@5 94.775 (97.266) Mem 22339MB +[2024-07-24 23:48:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.154) Loss 1.0234 (0.8197) Acc@1 75.342 (81.462) Acc@5 93.359 (95.852) Mem 22339MB +[2024-07-24 23:48:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.134 Acc@5 95.843 +[2024-07-24 23:48:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.1% +[2024-07-24 23:48:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.802 (0.802) Loss 0.5366 (0.5366) Acc@1 89.258 (89.258) Acc@5 98.584 (98.584) Mem 22339MB +[2024-07-24 23:48:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.187) Loss 0.8540 (0.6699) Acc@1 80.029 (85.769) Acc@5 95.898 (97.612) Mem 22339MB +[2024-07-24 23:48:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.158) Loss 0.9634 (0.7836) Acc@1 76.074 (82.506) Acc@5 95.020 (96.380) Mem 22339MB +[2024-07-24 23:48:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.212 Acc@5 96.397 +[2024-07-24 23:48:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.2% +[2024-07-24 23:48:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.21% +[2024-07-24 23:48:33 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-24 23:48:35 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-24 23:48:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][0/625] eta 0:15:46 lr 0.000992 wd 0.0500 time 1.5136 (1.5136) data time 0.9948 (0.9948) model time 0.0000 (0.0000) loss 7.9778 (7.9778) grad_norm 1.5564 (1.5564) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:48:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][10/625] eta 0:06:45 lr 0.000992 wd 0.0500 time 0.5730 (0.6591) data time 0.0006 (0.0911) model time 0.0000 (0.0000) loss 7.0121 (7.8765) grad_norm 1.6407 (1.7076) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:48:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][20/625] eta 0:06:14 lr 0.000992 wd 0.0500 time 0.5760 (0.6189) data time 0.0008 (0.0481) model time 0.0000 (0.0000) loss 8.8941 (8.0797) grad_norm 1.9623 (1.9329) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:48:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][30/625] eta 0:05:59 lr 0.000992 wd 0.0500 time 0.5745 (0.6043) data time 0.0006 (0.0329) model time 0.0000 (0.0000) loss 7.8703 (8.2072) grad_norm 1.5854 (1.9382) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:49:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][40/625] eta 0:05:49 lr 0.000991 wd 0.0500 time 0.5810 (0.5972) data time 0.0007 (0.0250) model time 0.0000 (0.0000) loss 7.6479 (8.2234) grad_norm 1.8663 (1.9975) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:49:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][50/625] eta 0:05:40 lr 0.000991 wd 0.0500 time 0.5738 (0.5926) data time 0.0006 (0.0203) model time 0.0000 (0.0000) loss 7.4601 (8.1633) grad_norm 2.0291 (1.9939) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:49:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][60/625] eta 0:05:33 lr 0.000991 wd 0.0500 time 0.5749 (0.5895) data time 0.0006 (0.0171) model time 0.5743 (0.5727) loss 7.9542 (8.1418) grad_norm 2.8352 (2.0296) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:49:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][70/625] eta 0:05:25 lr 0.000991 wd 0.0500 time 0.5761 (0.5874) data time 0.0008 (0.0148) model time 0.5753 (0.5732) loss 9.1650 (8.2045) grad_norm 2.1319 (2.0683) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:49:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][80/625] eta 0:05:20 lr 0.000991 wd 0.0500 time 0.7728 (0.5880) data time 0.0006 (0.0131) model time 0.7723 (0.5794) loss 7.2464 (8.1586) grad_norm 1.7277 (2.0530) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:49:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][90/625] eta 0:05:15 lr 0.000991 wd 0.0500 time 0.7477 (0.5903) data time 0.0006 (0.0117) model time 0.7471 (0.5866) loss 7.6619 (8.1516) grad_norm 3.1737 (2.0684) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:49:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][100/625] eta 0:05:13 lr 0.000991 wd 0.0500 time 0.6103 (0.5964) data time 0.0008 (0.0107) model time 0.6095 (0.5994) loss 6.6165 (8.1757) grad_norm 2.0583 (2.0579) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:49:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][110/625] eta 0:05:06 lr 0.000991 wd 0.0500 time 0.5797 (0.5946) data time 0.0006 (0.0098) model time 0.5791 (0.5955) loss 6.7986 (8.2021) grad_norm 1.6970 (2.0539) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:49:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][120/625] eta 0:04:59 lr 0.000991 wd 0.0500 time 0.5926 (0.5932) data time 0.0008 (0.0090) model time 0.5918 (0.5928) loss 8.9917 (8.1770) grad_norm 1.7220 (2.0956) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:49:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][130/625] eta 0:04:52 lr 0.000991 wd 0.0500 time 0.5765 (0.5917) data time 0.0008 (0.0084) model time 0.5757 (0.5904) loss 10.0046 (8.1667) grad_norm 1.5893 (2.0805) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:49:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][140/625] eta 0:04:46 lr 0.000991 wd 0.0500 time 0.5888 (0.5908) data time 0.0007 (0.0079) model time 0.5881 (0.5889) loss 7.3399 (8.1740) grad_norm 3.7008 (2.0924) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:50:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][150/625] eta 0:04:40 lr 0.000991 wd 0.0500 time 0.5760 (0.5898) data time 0.0007 (0.0074) model time 0.5753 (0.5875) loss 8.7439 (8.1830) grad_norm 1.5954 (2.0969) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:50:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][160/625] eta 0:04:34 lr 0.000990 wd 0.0500 time 0.5752 (0.5895) data time 0.0006 (0.0070) model time 0.5746 (0.5872) loss 7.1072 (8.2029) grad_norm 3.1973 (2.1077) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:50:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][170/625] eta 0:04:27 lr 0.000990 wd 0.0500 time 0.5737 (0.5887) data time 0.0008 (0.0067) model time 0.5730 (0.5861) loss 9.4107 (8.2118) grad_norm 1.9692 (2.1199) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:50:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][180/625] eta 0:04:21 lr 0.000990 wd 0.0500 time 0.5897 (0.5880) data time 0.0009 (0.0064) model time 0.5887 (0.5853) loss 6.9328 (8.1945) grad_norm 2.8033 (2.1344) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:50:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][190/625] eta 0:04:15 lr 0.000990 wd 0.0500 time 0.5735 (0.5873) data time 0.0009 (0.0061) model time 0.5726 (0.5845) loss 8.7844 (8.2074) grad_norm 1.8418 (2.1448) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:50:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][200/625] eta 0:04:09 lr 0.000990 wd 0.0500 time 0.5792 (0.5867) data time 0.0006 (0.0058) model time 0.5786 (0.5838) loss 7.5145 (8.1841) grad_norm 2.2936 (2.1356) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:50:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][210/625] eta 0:04:03 lr 0.000990 wd 0.0500 time 0.5722 (0.5860) data time 0.0006 (0.0056) model time 0.5717 (0.5830) loss 9.1360 (8.1987) grad_norm 2.2277 (2.1191) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:50:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][220/625] eta 0:03:57 lr 0.000990 wd 0.0500 time 0.5788 (0.5855) data time 0.0008 (0.0054) model time 0.5780 (0.5824) loss 8.0845 (8.1924) grad_norm 1.8548 (2.1157) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:50:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][230/625] eta 0:03:51 lr 0.000990 wd 0.0500 time 0.5718 (0.5852) data time 0.0006 (0.0052) model time 0.5712 (0.5822) loss 7.2907 (8.1821) grad_norm 1.9130 (2.1041) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:50:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][240/625] eta 0:03:45 lr 0.000990 wd 0.0500 time 0.5777 (0.5848) data time 0.0006 (0.0050) model time 0.5771 (0.5818) loss 6.7322 (8.1567) grad_norm 1.7038 (2.0945) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:51:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][250/625] eta 0:03:39 lr 0.000990 wd 0.0500 time 0.5770 (0.5844) data time 0.0008 (0.0048) model time 0.5762 (0.5814) loss 9.1120 (8.1588) grad_norm 1.8905 (2.0897) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:51:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][260/625] eta 0:03:33 lr 0.000990 wd 0.0500 time 0.5824 (0.5840) data time 0.0006 (0.0047) model time 0.5818 (0.5810) loss 8.1487 (8.1593) grad_norm 1.4183 (2.1054) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:51:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][270/625] eta 0:03:27 lr 0.000990 wd 0.0500 time 0.5777 (0.5837) data time 0.0006 (0.0045) model time 0.5771 (0.5807) loss 9.2858 (8.1562) grad_norm 1.6304 (2.1067) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:51:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][280/625] eta 0:03:21 lr 0.000989 wd 0.0500 time 0.5778 (0.5834) data time 0.0008 (0.0044) model time 0.5770 (0.5805) loss 8.3133 (8.1551) grad_norm 1.9417 (2.1082) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:51:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][290/625] eta 0:03:15 lr 0.000989 wd 0.0500 time 0.5751 (0.5831) data time 0.0006 (0.0043) model time 0.5745 (0.5802) loss 8.2265 (8.1575) grad_norm 1.7780 (2.1102) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:51:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][300/625] eta 0:03:09 lr 0.000989 wd 0.0500 time 0.5761 (0.5828) data time 0.0006 (0.0042) model time 0.5755 (0.5800) loss 7.8180 (8.1620) grad_norm 1.8548 (2.1061) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:51:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][310/625] eta 0:03:04 lr 0.000989 wd 0.0500 time 0.7406 (0.5848) data time 0.0009 (0.0041) model time 0.7397 (0.5824) loss 8.1947 (8.1772) grad_norm 2.5830 (2.1097) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:51:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][320/625] eta 0:02:58 lr 0.000989 wd 0.0500 time 0.5803 (0.5867) data time 0.0008 (0.0040) model time 0.5795 (0.5848) loss 8.2348 (8.1822) grad_norm 1.8890 (2.1075) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:51:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][330/625] eta 0:02:53 lr 0.000989 wd 0.0500 time 0.5737 (0.5868) data time 0.0006 (0.0039) model time 0.5731 (0.5849) loss 8.2693 (8.1998) grad_norm 1.7190 (2.1046) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:51:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][340/625] eta 0:02:47 lr 0.000989 wd 0.0500 time 0.5782 (0.5865) data time 0.0008 (0.0038) model time 0.5774 (0.5845) loss 8.9827 (8.1963) grad_norm 1.5249 (2.0976) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:52:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][350/625] eta 0:02:41 lr 0.000989 wd 0.0500 time 0.5760 (0.5861) data time 0.0006 (0.0037) model time 0.5754 (0.5842) loss 9.5863 (8.2018) grad_norm 1.6392 (2.0930) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:52:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][360/625] eta 0:02:35 lr 0.000989 wd 0.0500 time 0.5754 (0.5858) data time 0.0008 (0.0036) model time 0.5746 (0.5838) loss 10.1457 (8.1955) grad_norm 1.4612 (2.0900) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:52:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][370/625] eta 0:02:29 lr 0.000989 wd 0.0500 time 0.5746 (0.5855) data time 0.0008 (0.0035) model time 0.5739 (0.5835) loss 8.8651 (8.1914) grad_norm 2.0465 (2.0883) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:52:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][380/625] eta 0:02:23 lr 0.000989 wd 0.0500 time 0.5780 (0.5852) data time 0.0006 (0.0035) model time 0.5774 (0.5832) loss 6.8098 (8.1877) grad_norm 2.4832 (2.0899) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:52:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][390/625] eta 0:02:17 lr 0.000989 wd 0.0500 time 0.5744 (0.5850) data time 0.0009 (0.0034) model time 0.5736 (0.5830) loss 7.8159 (8.1862) grad_norm 1.5485 (2.0930) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:52:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][400/625] eta 0:02:11 lr 0.000989 wd 0.0500 time 0.5741 (0.5847) data time 0.0007 (0.0033) model time 0.5734 (0.5827) loss 9.8054 (8.1765) grad_norm 2.4571 (2.0965) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:52:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][410/625] eta 0:02:05 lr 0.000988 wd 0.0500 time 0.5747 (0.5845) data time 0.0009 (0.0033) model time 0.5739 (0.5825) loss 9.5759 (8.1676) grad_norm 1.5241 (2.0961) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:52:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][420/625] eta 0:01:59 lr 0.000988 wd 0.0500 time 0.5753 (0.5842) data time 0.0006 (0.0032) model time 0.5747 (0.5822) loss 7.0915 (8.1754) grad_norm 2.2050 (2.1054) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:52:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][430/625] eta 0:01:53 lr 0.000988 wd 0.0500 time 0.5723 (0.5840) data time 0.0006 (0.0032) model time 0.5717 (0.5820) loss 7.7543 (8.1646) grad_norm 2.2288 (2.1092) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:52:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][440/625] eta 0:01:47 lr 0.000988 wd 0.0500 time 0.5764 (0.5838) data time 0.0009 (0.0031) model time 0.5756 (0.5818) loss 8.7582 (8.1646) grad_norm 3.5368 (2.1085) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:52:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][450/625] eta 0:01:42 lr 0.000988 wd 0.0500 time 0.5775 (0.5840) data time 0.0008 (0.0030) model time 0.5768 (0.5820) loss 8.8488 (8.1533) grad_norm 3.3548 (2.1116) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:53:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][460/625] eta 0:01:36 lr 0.000988 wd 0.0500 time 0.5814 (0.5838) data time 0.0008 (0.0030) model time 0.5807 (0.5818) loss 9.1227 (8.1519) grad_norm 2.0093 (2.1100) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:53:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][470/625] eta 0:01:30 lr 0.000988 wd 0.0500 time 0.5777 (0.5836) data time 0.0006 (0.0030) model time 0.5771 (0.5817) loss 6.8393 (8.1508) grad_norm 2.0577 (2.1054) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:53:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][480/625] eta 0:01:24 lr 0.000988 wd 0.0500 time 0.5781 (0.5834) data time 0.0006 (0.0029) model time 0.5775 (0.5815) loss 7.0980 (8.1474) grad_norm 2.1609 (2.1097) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:53:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][490/625] eta 0:01:18 lr 0.000988 wd 0.0500 time 0.5760 (0.5833) data time 0.0007 (0.0029) model time 0.5753 (0.5813) loss 7.7739 (8.1338) grad_norm 1.8291 (2.1092) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:53:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][500/625] eta 0:01:12 lr 0.000988 wd 0.0500 time 0.5798 (0.5831) data time 0.0008 (0.0028) model time 0.5790 (0.5812) loss 8.9915 (8.1270) grad_norm 2.0060 (2.1060) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:53:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][510/625] eta 0:01:07 lr 0.000988 wd 0.0500 time 0.5827 (0.5830) data time 0.0008 (0.0028) model time 0.5819 (0.5811) loss 9.5076 (8.1312) grad_norm 2.8714 (2.1064) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:53:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][520/625] eta 0:01:01 lr 0.000988 wd 0.0500 time 0.5754 (0.5828) data time 0.0006 (0.0027) model time 0.5748 (0.5809) loss 6.7018 (8.1302) grad_norm 1.7025 (2.1101) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:53:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][530/625] eta 0:00:55 lr 0.000987 wd 0.0500 time 0.7045 (0.5838) data time 0.0006 (0.0027) model time 0.7038 (0.5820) loss 7.8630 (8.1334) grad_norm 1.7612 (2.1110) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-24 23:53:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][540/625] eta 0:00:49 lr 0.000987 wd 0.0500 time 0.5763 (0.5846) data time 0.0007 (0.0027) model time 0.5756 (0.5830) loss 9.0576 (8.1300) grad_norm 3.0647 (2.1184) loss_scale 8192.0000 (4126.2847) mem 22339MB +[2024-07-24 23:53:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][550/625] eta 0:00:43 lr 0.000987 wd 0.0500 time 0.5755 (0.5847) data time 0.0006 (0.0026) model time 0.5749 (0.5831) loss 6.4871 (8.1256) grad_norm 2.1594 (2.1159) loss_scale 8192.0000 (4200.0726) mem 22339MB +[2024-07-24 23:54:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][560/625] eta 0:00:37 lr 0.000987 wd 0.0500 time 0.5703 (0.5845) data time 0.0007 (0.0026) model time 0.5696 (0.5829) loss 9.0835 (8.1384) grad_norm 1.9437 (2.1139) loss_scale 8192.0000 (4271.2299) mem 22339MB +[2024-07-24 23:54:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][570/625] eta 0:00:32 lr 0.000987 wd 0.0500 time 0.5997 (0.5844) data time 0.0007 (0.0026) model time 0.5991 (0.5827) loss 8.4264 (8.1351) grad_norm 1.9406 (2.1097) loss_scale 8192.0000 (4339.8949) mem 22339MB +[2024-07-24 23:54:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][580/625] eta 0:00:26 lr 0.000987 wd 0.0500 time 0.5756 (0.5842) data time 0.0007 (0.0026) model time 0.5749 (0.5825) loss 8.0747 (8.1349) grad_norm 2.2365 (2.1094) loss_scale 8192.0000 (4406.1962) mem 22339MB +[2024-07-24 23:54:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][590/625] eta 0:00:20 lr 0.000987 wd 0.0500 time 0.5760 (0.5840) data time 0.0008 (0.0025) model time 0.5753 (0.5823) loss 9.2108 (8.1354) grad_norm 1.8421 (2.1068) loss_scale 8192.0000 (4470.2538) mem 22339MB +[2024-07-24 23:54:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][600/625] eta 0:00:14 lr 0.000987 wd 0.0500 time 0.5742 (0.5839) data time 0.0009 (0.0025) model time 0.5733 (0.5822) loss 8.4231 (8.1396) grad_norm 3.0343 (2.1060) loss_scale 8192.0000 (4532.1797) mem 22339MB +[2024-07-24 23:54:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][610/625] eta 0:00:08 lr 0.000987 wd 0.0500 time 0.5754 (0.5837) data time 0.0004 (0.0025) model time 0.5750 (0.5820) loss 7.1105 (8.1495) grad_norm 1.8439 (2.1021) loss_scale 8192.0000 (4592.0786) mem 22339MB +[2024-07-24 23:54:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [97/300][620/625] eta 0:00:02 lr 0.000987 wd 0.0500 time 0.5743 (0.5836) data time 0.0004 (0.0025) model time 0.5739 (0.5819) loss 9.8590 (8.1485) grad_norm 2.0158 (2.1031) loss_scale 8192.0000 (4650.0483) mem 22339MB +[2024-07-24 23:54:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 97 training takes 0:06:04 +[2024-07-24 23:54:40 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-24 23:54:41 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-24 23:54:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.609 (0.609) Loss 0.5518 (0.5518) Acc@1 88.574 (88.574) Acc@5 98.291 (98.291) Mem 22339MB +[2024-07-24 23:54:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.170) Loss 0.9077 (0.6856) Acc@1 78.223 (84.850) Acc@5 95.166 (97.368) Mem 22339MB +[2024-07-24 23:54:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.149) Loss 1.0039 (0.8122) Acc@1 76.318 (81.538) Acc@5 93.945 (95.938) Mem 22339MB +[2024-07-24 23:54:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.268 Acc@5 95.973 +[2024-07-24 23:54:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.3% +[2024-07-24 23:54:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 1.022 (1.022) Loss 0.5337 (0.5337) Acc@1 89.209 (89.209) Acc@5 98.584 (98.584) Mem 22339MB +[2024-07-24 23:54:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.208) Loss 0.8525 (0.6670) Acc@1 79.980 (85.702) Acc@5 95.850 (97.616) Mem 22339MB +[2024-07-24 23:54:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.169) Loss 0.9604 (0.7808) Acc@1 76.123 (82.482) Acc@5 95.068 (96.389) Mem 22339MB +[2024-07-24 23:54:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.194 Acc@5 96.405 +[2024-07-24 23:54:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.2% +[2024-07-24 23:54:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][0/625] eta 0:14:05 lr 0.000987 wd 0.0500 time 1.3520 (1.3520) data time 0.4378 (0.4378) model time 0.0000 (0.0000) loss 8.0909 (8.0909) grad_norm 1.6921 (1.6921) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:54:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][10/625] eta 0:06:35 lr 0.000987 wd 0.0500 time 0.5754 (0.6434) data time 0.0008 (0.0405) model time 0.0000 (0.0000) loss 8.5519 (7.8666) grad_norm 1.6974 (1.8064) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:55:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][20/625] eta 0:06:11 lr 0.000987 wd 0.0500 time 0.5691 (0.6139) data time 0.0007 (0.0216) model time 0.0000 (0.0000) loss 7.2751 (8.0760) grad_norm 1.7906 (1.9298) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:55:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][30/625] eta 0:05:57 lr 0.000986 wd 0.0500 time 0.5746 (0.6010) data time 0.0007 (0.0149) model time 0.0000 (0.0000) loss 7.1465 (8.0282) grad_norm 1.9863 (1.8921) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:55:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][40/625] eta 0:05:48 lr 0.000986 wd 0.0500 time 0.5829 (0.5949) data time 0.0006 (0.0114) model time 0.0000 (0.0000) loss 7.4914 (8.0149) grad_norm 3.0736 (1.8772) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:55:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][50/625] eta 0:05:39 lr 0.000986 wd 0.0500 time 0.5727 (0.5906) data time 0.0007 (0.0094) model time 0.0000 (0.0000) loss 6.5029 (8.0230) grad_norm 2.2013 (1.9885) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:55:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][60/625] eta 0:05:32 lr 0.000986 wd 0.0500 time 0.5754 (0.5880) data time 0.0009 (0.0080) model time 0.5745 (0.5734) loss 8.4438 (8.0455) grad_norm 1.5868 (2.0032) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:55:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][70/625] eta 0:05:25 lr 0.000986 wd 0.0500 time 0.5739 (0.5861) data time 0.0006 (0.0069) model time 0.5732 (0.5735) loss 7.7262 (7.9762) grad_norm 2.2648 (1.9668) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:55:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][80/625] eta 0:05:18 lr 0.000986 wd 0.0500 time 0.5736 (0.5846) data time 0.0008 (0.0062) model time 0.5727 (0.5735) loss 9.1252 (7.9847) grad_norm 1.8231 (1.9484) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:55:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][90/625] eta 0:05:12 lr 0.000986 wd 0.0500 time 0.5753 (0.5835) data time 0.0008 (0.0056) model time 0.5745 (0.5735) loss 9.1329 (7.9746) grad_norm 1.5573 (1.9622) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:55:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][100/625] eta 0:05:05 lr 0.000986 wd 0.0500 time 0.5767 (0.5828) data time 0.0008 (0.0051) model time 0.5758 (0.5739) loss 7.8594 (7.9378) grad_norm 2.2801 (1.9714) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:55:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][110/625] eta 0:04:59 lr 0.000986 wd 0.0500 time 0.5745 (0.5821) data time 0.0006 (0.0047) model time 0.5739 (0.5739) loss 7.0409 (7.9469) grad_norm 2.6460 (1.9724) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:55:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][120/625] eta 0:04:54 lr 0.000986 wd 0.0500 time 0.5757 (0.5825) data time 0.0006 (0.0044) model time 0.5751 (0.5758) loss 7.5549 (7.9302) grad_norm 1.8890 (1.9674) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:56:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][130/625] eta 0:04:51 lr 0.000986 wd 0.0500 time 0.7654 (0.5885) data time 0.0008 (0.0041) model time 0.7646 (0.5863) loss 9.0450 (7.9349) grad_norm 1.6178 (1.9456) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:56:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][140/625] eta 0:04:46 lr 0.000986 wd 0.0500 time 0.5783 (0.5909) data time 0.0009 (0.0039) model time 0.5774 (0.5903) loss 10.1786 (7.9670) grad_norm 1.7847 (1.9269) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:56:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][150/625] eta 0:04:40 lr 0.000985 wd 0.0500 time 0.5784 (0.5900) data time 0.0008 (0.0037) model time 0.5776 (0.5888) loss 8.0489 (7.9668) grad_norm 2.3208 (1.9265) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:56:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][160/625] eta 0:04:33 lr 0.000985 wd 0.0500 time 0.5739 (0.5890) data time 0.0008 (0.0035) model time 0.5731 (0.5874) loss 8.3505 (7.9903) grad_norm 1.7927 (1.9318) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:56:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][170/625] eta 0:04:27 lr 0.000985 wd 0.0500 time 0.5759 (0.5882) data time 0.0006 (0.0034) model time 0.5753 (0.5863) loss 7.5906 (7.9695) grad_norm 2.3455 (1.9306) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:56:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][180/625] eta 0:04:21 lr 0.000985 wd 0.0500 time 0.5751 (0.5874) data time 0.0006 (0.0032) model time 0.5744 (0.5853) loss 8.1585 (8.0002) grad_norm 2.2026 (1.9355) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:56:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][190/625] eta 0:04:15 lr 0.000985 wd 0.0500 time 0.5753 (0.5868) data time 0.0008 (0.0031) model time 0.5745 (0.5846) loss 8.8879 (7.9853) grad_norm 2.8740 (1.9581) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:56:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][200/625] eta 0:04:09 lr 0.000985 wd 0.0500 time 0.5769 (0.5863) data time 0.0006 (0.0030) model time 0.5763 (0.5839) loss 8.2925 (7.9977) grad_norm 1.6860 (1.9798) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:56:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][210/625] eta 0:04:03 lr 0.000985 wd 0.0500 time 0.5796 (0.5858) data time 0.0008 (0.0029) model time 0.5788 (0.5834) loss 8.4134 (8.0191) grad_norm 2.9510 (1.9995) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:56:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][220/625] eta 0:03:57 lr 0.000985 wd 0.0500 time 0.5735 (0.5852) data time 0.0006 (0.0028) model time 0.5729 (0.5828) loss 8.0116 (8.0321) grad_norm 1.7431 (2.0145) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:57:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][230/625] eta 0:03:50 lr 0.000985 wd 0.0500 time 0.5729 (0.5847) data time 0.0008 (0.0027) model time 0.5720 (0.5822) loss 9.5603 (8.0321) grad_norm 1.4957 (2.0064) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:57:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][240/625] eta 0:03:45 lr 0.000985 wd 0.0500 time 0.5730 (0.5847) data time 0.0008 (0.0026) model time 0.5722 (0.5822) loss 9.1065 (8.0518) grad_norm 2.1577 (2.0053) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:57:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][250/625] eta 0:03:39 lr 0.000985 wd 0.0500 time 0.5768 (0.5843) data time 0.0006 (0.0026) model time 0.5761 (0.5818) loss 9.1907 (8.0510) grad_norm 2.2905 (2.0098) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:57:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][260/625] eta 0:03:33 lr 0.000985 wd 0.0500 time 0.5804 (0.5839) data time 0.0006 (0.0025) model time 0.5798 (0.5814) loss 7.8708 (8.0582) grad_norm 2.0683 (2.0187) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:57:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][270/625] eta 0:03:27 lr 0.000984 wd 0.0500 time 0.5740 (0.5836) data time 0.0008 (0.0024) model time 0.5732 (0.5811) loss 8.9709 (8.0633) grad_norm 2.3583 (2.0159) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:57:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][280/625] eta 0:03:21 lr 0.000984 wd 0.0500 time 0.5783 (0.5833) data time 0.0007 (0.0024) model time 0.5775 (0.5808) loss 8.5545 (8.0594) grad_norm 2.9597 (2.0254) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:57:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][290/625] eta 0:03:15 lr 0.000984 wd 0.0500 time 0.5757 (0.5830) data time 0.0007 (0.0023) model time 0.5750 (0.5806) loss 8.3981 (8.0538) grad_norm 2.2546 (2.0234) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:57:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][300/625] eta 0:03:09 lr 0.000984 wd 0.0500 time 0.5763 (0.5828) data time 0.0008 (0.0023) model time 0.5756 (0.5803) loss 8.2367 (8.0559) grad_norm 1.8814 (2.0272) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:57:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][310/625] eta 0:03:03 lr 0.000984 wd 0.0500 time 0.5760 (0.5825) data time 0.0008 (0.0022) model time 0.5752 (0.5801) loss 7.1915 (8.0605) grad_norm 1.7493 (2.0242) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:57:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][320/625] eta 0:02:57 lr 0.000984 wd 0.0500 time 0.5746 (0.5823) data time 0.0008 (0.0022) model time 0.5738 (0.5799) loss 7.6457 (8.0454) grad_norm 2.0010 (2.0181) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:58:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][330/625] eta 0:02:51 lr 0.000984 wd 0.0500 time 0.5784 (0.5821) data time 0.0006 (0.0021) model time 0.5779 (0.5798) loss 7.8540 (8.0463) grad_norm 1.5036 (2.0321) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:58:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][340/625] eta 0:02:46 lr 0.000984 wd 0.0500 time 0.7517 (0.5827) data time 0.0008 (0.0021) model time 0.7509 (0.5805) loss 9.7376 (8.0502) grad_norm 1.9288 (2.0468) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:58:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][350/625] eta 0:02:40 lr 0.000984 wd 0.0500 time 0.7104 (0.5842) data time 0.0008 (0.0021) model time 0.7097 (0.5823) loss 9.5411 (8.0615) grad_norm 1.7381 (2.0409) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:58:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][360/625] eta 0:02:35 lr 0.000984 wd 0.0500 time 0.5718 (0.5851) data time 0.0007 (0.0020) model time 0.5711 (0.5834) loss 7.5310 (8.0505) grad_norm 2.0784 (2.0348) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:58:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][370/625] eta 0:02:29 lr 0.000984 wd 0.0500 time 0.5746 (0.5848) data time 0.0008 (0.0020) model time 0.5739 (0.5831) loss 9.3427 (8.0490) grad_norm 1.7594 (2.0351) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:58:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][380/625] eta 0:02:23 lr 0.000984 wd 0.0500 time 0.5744 (0.5845) data time 0.0008 (0.0020) model time 0.5736 (0.5827) loss 7.2670 (8.0439) grad_norm 2.1725 (2.0278) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:58:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][390/625] eta 0:02:17 lr 0.000983 wd 0.0500 time 0.5752 (0.5842) data time 0.0008 (0.0019) model time 0.5745 (0.5825) loss 6.7465 (8.0337) grad_norm 2.9315 (2.0334) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:58:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][400/625] eta 0:02:11 lr 0.000983 wd 0.0500 time 0.5735 (0.5840) data time 0.0006 (0.0019) model time 0.5729 (0.5822) loss 8.3759 (8.0441) grad_norm 2.0126 (2.0382) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:58:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][410/625] eta 0:02:05 lr 0.000983 wd 0.0500 time 0.5785 (0.5838) data time 0.0009 (0.0019) model time 0.5776 (0.5820) loss 8.0917 (8.0439) grad_norm 1.6188 (2.0432) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:58:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][420/625] eta 0:01:59 lr 0.000983 wd 0.0500 time 0.5775 (0.5836) data time 0.0006 (0.0019) model time 0.5770 (0.5818) loss 6.6720 (8.0543) grad_norm 2.7579 (2.0497) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:59:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][430/625] eta 0:01:53 lr 0.000983 wd 0.0500 time 0.5726 (0.5834) data time 0.0007 (0.0018) model time 0.5719 (0.5816) loss 8.7164 (8.0443) grad_norm 2.4161 (2.0576) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:59:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][440/625] eta 0:01:47 lr 0.000983 wd 0.0500 time 0.5744 (0.5832) data time 0.0008 (0.0018) model time 0.5736 (0.5814) loss 8.8638 (8.0509) grad_norm 2.8182 (2.0713) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:59:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][450/625] eta 0:01:42 lr 0.000983 wd 0.0500 time 0.5743 (0.5830) data time 0.0006 (0.0018) model time 0.5737 (0.5812) loss 7.0742 (8.0480) grad_norm 2.0409 (2.0772) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:59:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][460/625] eta 0:01:36 lr 0.000983 wd 0.0500 time 0.6803 (0.5830) data time 0.0008 (0.0018) model time 0.6795 (0.5813) loss 8.7853 (8.0500) grad_norm 1.7727 (2.0767) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:59:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][470/625] eta 0:01:30 lr 0.000983 wd 0.0500 time 0.5772 (0.5827) data time 0.0008 (0.0018) model time 0.5764 (0.5810) loss 7.5298 (8.0619) grad_norm 1.3973 (2.0701) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:59:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][480/625] eta 0:01:24 lr 0.000983 wd 0.0500 time 0.5772 (0.5826) data time 0.0006 (0.0017) model time 0.5766 (0.5808) loss 7.9151 (8.0657) grad_norm 1.7339 (2.0620) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:59:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][490/625] eta 0:01:18 lr 0.000983 wd 0.0500 time 0.5747 (0.5825) data time 0.0008 (0.0017) model time 0.5739 (0.5807) loss 8.0447 (8.0613) grad_norm 1.7408 (2.0603) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:59:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][500/625] eta 0:01:12 lr 0.000983 wd 0.0500 time 0.5753 (0.5824) data time 0.0006 (0.0017) model time 0.5747 (0.5806) loss 7.0097 (8.0558) grad_norm 1.9133 (2.0572) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:59:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][510/625] eta 0:01:06 lr 0.000982 wd 0.0500 time 0.5871 (0.5823) data time 0.0006 (0.0017) model time 0.5865 (0.5805) loss 7.4679 (8.0553) grad_norm 2.6898 (2.0619) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:59:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][520/625] eta 0:01:01 lr 0.000982 wd 0.0500 time 0.5733 (0.5821) data time 0.0008 (0.0017) model time 0.5725 (0.5803) loss 7.5953 (8.0499) grad_norm 1.8242 (2.0698) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-24 23:59:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][530/625] eta 0:00:55 lr 0.000982 wd 0.0500 time 0.5750 (0.5820) data time 0.0008 (0.0016) model time 0.5742 (0.5802) loss 8.1440 (8.0503) grad_norm 2.8870 (2.0760) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:00:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][540/625] eta 0:00:49 lr 0.000982 wd 0.0500 time 0.5756 (0.5820) data time 0.0006 (0.0016) model time 0.5750 (0.5803) loss 8.5148 (8.0457) grad_norm 1.6163 (2.0754) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:00:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][550/625] eta 0:00:43 lr 0.000982 wd 0.0500 time 0.5763 (0.5821) data time 0.0008 (0.0016) model time 0.5755 (0.5804) loss 8.1265 (8.0403) grad_norm 2.5608 (2.0816) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:00:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][560/625] eta 0:00:37 lr 0.000982 wd 0.0500 time 0.7351 (0.5823) data time 0.0006 (0.0016) model time 0.7345 (0.5806) loss 6.3621 (8.0286) grad_norm 2.0068 (2.0879) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:00:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][570/625] eta 0:00:32 lr 0.000982 wd 0.0500 time 0.5729 (0.5830) data time 0.0006 (0.0016) model time 0.5723 (0.5815) loss 6.7235 (8.0333) grad_norm 2.1473 (2.0898) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:00:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][580/625] eta 0:00:26 lr 0.000982 wd 0.0500 time 0.5735 (0.5840) data time 0.0007 (0.0016) model time 0.5729 (0.5826) loss 7.8516 (8.0263) grad_norm 1.6665 (2.0857) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:00:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][590/625] eta 0:00:20 lr 0.000982 wd 0.0500 time 0.5796 (0.5839) data time 0.0006 (0.0016) model time 0.5790 (0.5824) loss 8.1734 (8.0273) grad_norm 1.4878 (2.0809) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:00:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][600/625] eta 0:00:14 lr 0.000982 wd 0.0500 time 0.5732 (0.5838) data time 0.0006 (0.0016) model time 0.5725 (0.5823) loss 7.1892 (8.0280) grad_norm 1.9448 (2.0766) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:00:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][610/625] eta 0:00:08 lr 0.000982 wd 0.0500 time 0.5765 (0.5836) data time 0.0006 (0.0016) model time 0.5759 (0.5822) loss 6.9425 (8.0226) grad_norm 2.0735 (2.0762) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:00:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [98/300][620/625] eta 0:00:02 lr 0.000982 wd 0.0500 time 0.5736 (0.5835) data time 0.0004 (0.0015) model time 0.5732 (0.5820) loss 7.6059 (8.0200) grad_norm 2.0984 (2.0793) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:00:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 98 training takes 0:06:04 +[2024-07-25 00:00:53 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 00:00:55 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 00:00:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.467 (0.467) Loss 0.5435 (0.5435) Acc@1 88.086 (88.086) Acc@5 98.291 (98.291) Mem 22339MB +[2024-07-25 00:00:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.157) Loss 0.9062 (0.6809) Acc@1 78.369 (84.881) Acc@5 95.361 (97.350) Mem 22339MB +[2024-07-25 00:00:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 1.0449 (0.8098) Acc@1 73.975 (81.445) Acc@5 93.652 (95.922) Mem 22339MB +[2024-07-25 00:00:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.220 Acc@5 95.915 +[2024-07-25 00:00:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.2% +[2024-07-25 00:00:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.761 (0.761) Loss 0.5312 (0.5312) Acc@1 89.160 (89.160) Acc@5 98.584 (98.584) Mem 22339MB +[2024-07-25 00:01:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.183) Loss 0.8477 (0.6642) Acc@1 79.932 (85.729) Acc@5 95.947 (97.625) Mem 22339MB +[2024-07-25 00:01:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.156) Loss 0.9565 (0.7780) Acc@1 76.172 (82.487) Acc@5 95.068 (96.405) Mem 22339MB +[2024-07-25 00:01:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.194 Acc@5 96.419 +[2024-07-25 00:01:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.2% +[2024-07-25 00:01:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][0/625] eta 0:15:02 lr 0.000982 wd 0.0500 time 1.4432 (1.4432) data time 0.6875 (0.6875) model time 0.0000 (0.0000) loss 8.8904 (8.8904) grad_norm 2.2265 (2.2265) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:01:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][10/625] eta 0:06:43 lr 0.000981 wd 0.0500 time 0.5749 (0.6555) data time 0.0006 (0.0632) model time 0.0000 (0.0000) loss 8.2525 (8.2522) grad_norm 2.1752 (2.3944) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:01:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][20/625] eta 0:06:19 lr 0.000981 wd 0.0500 time 0.7973 (0.6279) data time 0.0008 (0.0335) model time 0.0000 (0.0000) loss 8.5287 (8.1216) grad_norm 2.5104 (2.4886) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:01:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][30/625] eta 0:06:47 lr 0.000981 wd 0.0500 time 2.9503 (0.6855) data time 0.0006 (0.0230) model time 0.0000 (0.0000) loss 7.2487 (7.9841) grad_norm 2.2242 (2.5086) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:01:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][40/625] eta 0:06:26 lr 0.000981 wd 0.0500 time 0.5758 (0.6611) data time 0.0006 (0.0190) model time 0.0000 (0.0000) loss 8.9515 (7.9621) grad_norm 2.3335 (2.4922) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:01:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][50/625] eta 0:06:10 lr 0.000981 wd 0.0500 time 0.5816 (0.6449) data time 0.0006 (0.0154) model time 0.0000 (0.0000) loss 7.8646 (7.9053) grad_norm 2.3940 (2.4472) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:01:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][60/625] eta 0:05:58 lr 0.000981 wd 0.0500 time 0.5865 (0.6340) data time 0.0008 (0.0130) model time 0.5857 (0.5777) loss 7.8318 (7.9714) grad_norm 2.2571 (2.4003) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:01:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][70/625] eta 0:05:47 lr 0.000981 wd 0.0500 time 0.5845 (0.6264) data time 0.0007 (0.0113) model time 0.5839 (0.5786) loss 9.1452 (8.0317) grad_norm 2.0056 (2.3276) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:01:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][80/625] eta 0:05:38 lr 0.000981 wd 0.0500 time 0.5866 (0.6210) data time 0.0008 (0.0101) model time 0.5857 (0.5794) loss 8.9093 (8.0605) grad_norm 1.9527 (2.2506) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:01:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][90/625] eta 0:05:29 lr 0.000981 wd 0.0500 time 0.5803 (0.6163) data time 0.0008 (0.0091) model time 0.5795 (0.5789) loss 7.8822 (8.0003) grad_norm 1.8820 (2.2524) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:02:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][100/625] eta 0:05:21 lr 0.000981 wd 0.0500 time 0.5849 (0.6125) data time 0.0006 (0.0083) model time 0.5842 (0.5786) loss 8.2679 (7.9793) grad_norm 1.6256 (2.2289) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:02:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][110/625] eta 0:05:14 lr 0.000981 wd 0.0500 time 0.5819 (0.6098) data time 0.0006 (0.0076) model time 0.5813 (0.5791) loss 9.1573 (7.9771) grad_norm 1.8581 (2.2010) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:02:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][120/625] eta 0:05:06 lr 0.000981 wd 0.0500 time 0.5803 (0.6072) data time 0.0006 (0.0070) model time 0.5797 (0.5789) loss 7.9234 (7.9737) grad_norm 2.1759 (2.1872) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:02:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][130/625] eta 0:04:59 lr 0.000980 wd 0.0500 time 0.5881 (0.6050) data time 0.0008 (0.0066) model time 0.5873 (0.5786) loss 8.2037 (8.0107) grad_norm 1.6343 (2.1666) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:02:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][140/625] eta 0:04:52 lr 0.000980 wd 0.0500 time 0.5925 (0.6031) data time 0.0008 (0.0062) model time 0.5917 (0.5784) loss 8.2218 (8.0337) grad_norm 1.7595 (2.1448) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:02:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][150/625] eta 0:04:45 lr 0.000980 wd 0.0500 time 0.5915 (0.6017) data time 0.0008 (0.0058) model time 0.5907 (0.5787) loss 8.1752 (8.0196) grad_norm 1.5633 (2.1206) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:02:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][160/625] eta 0:04:40 lr 0.000980 wd 0.0500 time 0.7433 (0.6031) data time 0.0008 (0.0055) model time 0.7426 (0.5828) loss 8.3150 (8.0143) grad_norm 2.1478 (2.1253) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:02:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][170/625] eta 0:04:35 lr 0.000980 wd 0.0500 time 0.7790 (0.6060) data time 0.0007 (0.0053) model time 0.7782 (0.5886) loss 8.0646 (8.0052) grad_norm 1.4643 (2.1206) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:02:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][180/625] eta 0:04:29 lr 0.000980 wd 0.0500 time 0.6015 (0.6064) data time 0.0008 (0.0050) model time 0.6008 (0.5903) loss 8.3771 (8.0116) grad_norm 1.9116 (2.1154) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:02:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][190/625] eta 0:04:23 lr 0.000980 wd 0.0500 time 0.5825 (0.6049) data time 0.0008 (0.0048) model time 0.5817 (0.5894) loss 6.7785 (7.9957) grad_norm 1.6140 (2.1394) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:03:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][200/625] eta 0:04:16 lr 0.000980 wd 0.0500 time 0.5784 (0.6035) data time 0.0008 (0.0046) model time 0.5776 (0.5886) loss 9.7621 (8.0357) grad_norm 2.6029 (2.1321) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:03:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][210/625] eta 0:04:10 lr 0.000980 wd 0.0500 time 0.5960 (0.6026) data time 0.0006 (0.0046) model time 0.5953 (0.5880) loss 7.9238 (8.0369) grad_norm 1.7729 (2.1192) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:03:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][220/625] eta 0:04:03 lr 0.000980 wd 0.0500 time 0.5823 (0.6016) data time 0.0009 (0.0045) model time 0.5814 (0.5875) loss 7.9434 (8.0217) grad_norm 2.2862 (2.1056) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:03:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][230/625] eta 0:03:57 lr 0.000980 wd 0.0500 time 0.5812 (0.6006) data time 0.0008 (0.0043) model time 0.5804 (0.5869) loss 9.0384 (8.0347) grad_norm 3.4816 (2.1055) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:03:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][240/625] eta 0:03:50 lr 0.000980 wd 0.0500 time 0.5385 (0.5999) data time 0.0006 (0.0042) model time 0.5378 (0.5867) loss 8.0736 (8.0394) grad_norm 1.7135 (2.1029) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:03:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][250/625] eta 0:03:44 lr 0.000979 wd 0.0500 time 0.5785 (0.5992) data time 0.0008 (0.0041) model time 0.5777 (0.5864) loss 7.9263 (8.0566) grad_norm 3.0221 (2.1027) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:03:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][260/625] eta 0:03:38 lr 0.000979 wd 0.0500 time 0.6052 (0.5985) data time 0.0008 (0.0039) model time 0.6044 (0.5862) loss 8.3998 (8.0539) grad_norm 1.6111 (2.0862) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:03:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][270/625] eta 0:03:32 lr 0.000979 wd 0.0500 time 0.5836 (0.5979) data time 0.0005 (0.0038) model time 0.5831 (0.5859) loss 6.6152 (8.0433) grad_norm 1.7149 (2.0736) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:03:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][280/625] eta 0:03:26 lr 0.000979 wd 0.0500 time 0.5806 (0.5972) data time 0.0007 (0.0037) model time 0.5799 (0.5856) loss 6.8986 (8.0241) grad_norm 1.9011 (2.0679) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:03:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][290/625] eta 0:03:19 lr 0.000979 wd 0.0500 time 0.5896 (0.5966) data time 0.0007 (0.0036) model time 0.5889 (0.5853) loss 8.2578 (8.0254) grad_norm 1.8954 (2.0736) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:04:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][300/625] eta 0:03:13 lr 0.000979 wd 0.0500 time 0.5847 (0.5961) data time 0.0006 (0.0035) model time 0.5841 (0.5851) loss 8.4809 (8.0203) grad_norm 1.7558 (2.0677) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:04:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][310/625] eta 0:03:07 lr 0.000979 wd 0.0500 time 0.5946 (0.5956) data time 0.0006 (0.0034) model time 0.5940 (0.5849) loss 8.7905 (8.0241) grad_norm 1.8770 (2.0755) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:04:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][320/625] eta 0:03:01 lr 0.000979 wd 0.0500 time 0.5908 (0.5951) data time 0.0009 (0.0034) model time 0.5899 (0.5846) loss 7.3055 (8.0301) grad_norm 1.4560 (2.0780) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:04:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][330/625] eta 0:02:55 lr 0.000979 wd 0.0500 time 0.5865 (0.5947) data time 0.0008 (0.0033) model time 0.5856 (0.5845) loss 8.6682 (8.0356) grad_norm 1.6446 (2.0638) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:04:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][340/625] eta 0:02:49 lr 0.000979 wd 0.0500 time 0.5819 (0.5942) data time 0.0006 (0.0032) model time 0.5813 (0.5842) loss 8.0198 (8.0315) grad_norm 1.9454 (2.0547) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:04:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][350/625] eta 0:02:43 lr 0.000979 wd 0.0500 time 0.5846 (0.5937) data time 0.0006 (0.0031) model time 0.5840 (0.5840) loss 7.4262 (8.0392) grad_norm 1.8934 (2.0515) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:04:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][360/625] eta 0:02:37 lr 0.000979 wd 0.0500 time 0.5852 (0.5933) data time 0.0007 (0.0031) model time 0.5845 (0.5838) loss 7.3477 (8.0442) grad_norm 2.1614 (2.0462) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:04:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][370/625] eta 0:02:31 lr 0.000978 wd 0.0500 time 0.5823 (0.5930) data time 0.0008 (0.0031) model time 0.5815 (0.5836) loss 7.8585 (8.0391) grad_norm 2.7042 (2.0555) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:04:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][380/625] eta 0:02:25 lr 0.000978 wd 0.0500 time 0.7417 (0.5938) data time 0.0008 (0.0030) model time 0.7409 (0.5848) loss 7.2392 (8.0421) grad_norm 1.7376 (2.0544) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:04:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][390/625] eta 0:02:19 lr 0.000978 wd 0.0500 time 0.7417 (0.5953) data time 0.0008 (0.0030) model time 0.7409 (0.5867) loss 7.4126 (8.0525) grad_norm 2.5026 (2.0566) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:05:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][400/625] eta 0:02:13 lr 0.000978 wd 0.0500 time 0.5840 (0.5955) data time 0.0007 (0.0029) model time 0.5833 (0.5872) loss 9.6768 (8.0533) grad_norm 2.5949 (2.0604) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:05:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][410/625] eta 0:02:07 lr 0.000978 wd 0.0500 time 0.5786 (0.5951) data time 0.0008 (0.0029) model time 0.5778 (0.5869) loss 8.7577 (8.0450) grad_norm 1.5084 (2.0560) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:05:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][420/625] eta 0:02:01 lr 0.000978 wd 0.0500 time 0.5804 (0.5947) data time 0.0006 (0.0029) model time 0.5798 (0.5866) loss 9.2780 (8.0488) grad_norm 2.1574 (2.0528) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:05:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][430/625] eta 0:01:55 lr 0.000978 wd 0.0500 time 0.5779 (0.5943) data time 0.0008 (0.0028) model time 0.5771 (0.5864) loss 8.7138 (8.0385) grad_norm 2.9366 (2.0552) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:05:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][440/625] eta 0:01:49 lr 0.000978 wd 0.0500 time 0.5840 (0.5940) data time 0.0006 (0.0028) model time 0.5834 (0.5862) loss 7.9329 (8.0387) grad_norm 1.5287 (2.0573) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:05:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][450/625] eta 0:01:43 lr 0.000978 wd 0.0500 time 0.5988 (0.5936) data time 0.0008 (0.0027) model time 0.5981 (0.5860) loss 9.1379 (8.0438) grad_norm 2.1586 (2.0682) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:05:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][460/625] eta 0:01:37 lr 0.000978 wd 0.0500 time 0.6882 (0.5935) data time 0.0008 (0.0027) model time 0.6874 (0.5861) loss 8.7296 (8.0473) grad_norm 3.2134 (2.0688) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:05:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][470/625] eta 0:01:31 lr 0.000978 wd 0.0500 time 0.5855 (0.5932) data time 0.0007 (0.0026) model time 0.5848 (0.5858) loss 8.3426 (8.0453) grad_norm 1.6712 (2.0660) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:05:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][480/625] eta 0:01:25 lr 0.000978 wd 0.0500 time 0.5824 (0.5929) data time 0.0008 (0.0026) model time 0.5816 (0.5856) loss 6.2495 (8.0377) grad_norm 2.0027 (2.0687) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:05:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][490/625] eta 0:01:19 lr 0.000977 wd 0.0500 time 0.5837 (0.5926) data time 0.0008 (0.0026) model time 0.5829 (0.5854) loss 8.0577 (8.0324) grad_norm 1.5676 (2.0690) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:05:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][500/625] eta 0:01:14 lr 0.000977 wd 0.0500 time 0.5835 (0.5923) data time 0.0006 (0.0026) model time 0.5829 (0.5852) loss 7.1114 (8.0307) grad_norm 1.9905 (2.0797) loss_scale 8192.0000 (8192.0000) mem 22339MB +[2024-07-25 00:06:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][510/625] eta 0:01:08 lr 0.000977 wd 0.0500 time 0.5855 (0.5920) data time 0.0006 (0.0025) model time 0.5849 (0.5850) loss 6.7953 (8.0329) grad_norm 2.2531 (inf) loss_scale 4096.0000 (8127.8748) mem 22339MB +[2024-07-25 00:06:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][520/625] eta 0:01:02 lr 0.000977 wd 0.0500 time 0.5827 (0.5917) data time 0.0008 (0.0025) model time 0.5819 (0.5849) loss 8.9979 (8.0248) grad_norm 1.6614 (inf) loss_scale 4096.0000 (8050.4875) mem 22339MB +[2024-07-25 00:06:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][530/625] eta 0:00:56 lr 0.000977 wd 0.0500 time 0.5823 (0.5915) data time 0.0008 (0.0025) model time 0.5816 (0.5847) loss 9.7717 (8.0317) grad_norm 1.7109 (inf) loss_scale 4096.0000 (7976.0151) mem 22339MB +[2024-07-25 00:06:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][540/625] eta 0:00:50 lr 0.000977 wd 0.0500 time 0.5763 (0.5913) data time 0.0006 (0.0025) model time 0.5757 (0.5846) loss 8.7168 (8.0297) grad_norm 1.6386 (inf) loss_scale 4096.0000 (7904.2957) mem 22339MB +[2024-07-25 00:06:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][550/625] eta 0:00:44 lr 0.000977 wd 0.0500 time 0.5775 (0.5910) data time 0.0008 (0.0024) model time 0.5768 (0.5844) loss 7.8426 (8.0296) grad_norm 1.8293 (inf) loss_scale 4096.0000 (7835.1797) mem 22339MB +[2024-07-25 00:06:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][560/625] eta 0:00:38 lr 0.000977 wd 0.0500 time 0.5838 (0.5908) data time 0.0008 (0.0024) model time 0.5830 (0.5843) loss 7.4681 (8.0306) grad_norm 1.6555 (inf) loss_scale 4096.0000 (7768.5276) mem 22339MB +[2024-07-25 00:06:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][570/625] eta 0:00:32 lr 0.000977 wd 0.0500 time 0.5850 (0.5906) data time 0.0006 (0.0024) model time 0.5844 (0.5842) loss 9.1212 (8.0347) grad_norm 1.5715 (inf) loss_scale 4096.0000 (7704.2102) mem 22339MB +[2024-07-25 00:06:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][580/625] eta 0:00:26 lr 0.000977 wd 0.0500 time 0.5819 (0.5904) data time 0.0008 (0.0023) model time 0.5812 (0.5840) loss 7.6308 (8.0417) grad_norm 2.0302 (inf) loss_scale 4096.0000 (7642.1067) mem 22339MB +[2024-07-25 00:06:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][590/625] eta 0:00:20 lr 0.000977 wd 0.0500 time 0.5865 (0.5902) data time 0.0008 (0.0023) model time 0.5858 (0.5839) loss 7.8803 (8.0287) grad_norm 1.9517 (inf) loss_scale 4096.0000 (7582.1049) mem 22339MB +[2024-07-25 00:06:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][600/625] eta 0:00:14 lr 0.000977 wd 0.0500 time 0.5845 (0.5905) data time 0.0006 (0.0023) model time 0.5839 (0.5844) loss 6.0871 (8.0344) grad_norm 3.5404 (inf) loss_scale 4096.0000 (7524.0998) mem 22339MB +[2024-07-25 00:07:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][610/625] eta 0:00:08 lr 0.000976 wd 0.0500 time 0.7503 (0.5916) data time 0.0004 (0.0023) model time 0.7499 (0.5857) loss 9.4699 (8.0361) grad_norm 1.9023 (inf) loss_scale 4096.0000 (7467.9935) mem 22339MB +[2024-07-25 00:07:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [99/300][620/625] eta 0:00:02 lr 0.000976 wd 0.0500 time 0.5816 (0.5916) data time 0.0004 (0.0022) model time 0.5812 (0.5857) loss 8.5697 (8.0382) grad_norm 2.6606 (inf) loss_scale 4096.0000 (7413.6940) mem 22339MB +[2024-07-25 00:07:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 99 training takes 0:06:09 +[2024-07-25 00:07:12 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 00:07:13 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 00:07:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.483 (0.483) Loss 0.5469 (0.5469) Acc@1 88.867 (88.867) Acc@5 98.193 (98.193) Mem 22339MB +[2024-07-25 00:07:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.9272 (0.7053) Acc@1 79.004 (84.979) Acc@5 95.410 (97.283) Mem 22339MB +[2024-07-25 00:07:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 1.0371 (0.8302) Acc@1 75.684 (81.571) Acc@5 94.385 (95.908) Mem 22339MB +[2024-07-25 00:07:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.320 Acc@5 95.907 +[2024-07-25 00:07:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.3% +[2024-07-25 00:07:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.842 (0.842) Loss 0.5283 (0.5283) Acc@1 89.307 (89.307) Acc@5 98.584 (98.584) Mem 22339MB +[2024-07-25 00:07:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.191) Loss 0.8452 (0.6617) Acc@1 79.883 (85.733) Acc@5 95.947 (97.630) Mem 22339MB +[2024-07-25 00:07:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.160) Loss 0.9556 (0.7753) Acc@1 76.270 (82.510) Acc@5 95.117 (96.410) Mem 22339MB +[2024-07-25 00:07:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.218 Acc@5 96.429 +[2024-07-25 00:07:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.2% +[2024-07-25 00:07:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.22% +[2024-07-25 00:07:20 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 00:07:22 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 00:07:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][0/625] eta 0:08:55 lr 0.000976 wd 0.0500 time 0.8569 (0.8569) data time 0.3381 (0.3381) model time 0.0000 (0.0000) loss 8.4764 (8.4764) grad_norm 1.6397 (1.6397) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:07:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][10/625] eta 0:06:09 lr 0.000976 wd 0.0500 time 0.5648 (0.6009) data time 0.0007 (0.0315) model time 0.0000 (0.0000) loss 9.4588 (8.0422) grad_norm 4.6874 (2.6795) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:07:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][20/625] eta 0:05:57 lr 0.000976 wd 0.0500 time 0.5738 (0.5901) data time 0.0006 (0.0170) model time 0.0000 (0.0000) loss 7.8573 (8.1045) grad_norm 2.5284 (2.4334) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:07:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][30/625] eta 0:05:48 lr 0.000976 wd 0.0500 time 0.5618 (0.5855) data time 0.0006 (0.0118) model time 0.0000 (0.0000) loss 9.9370 (8.0757) grad_norm 1.7314 (2.3165) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:07:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][40/625] eta 0:05:41 lr 0.000976 wd 0.0500 time 0.5630 (0.5837) data time 0.0006 (0.0092) model time 0.0000 (0.0000) loss 9.0721 (8.0934) grad_norm 1.8747 (2.2108) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:07:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][50/625] eta 0:05:34 lr 0.000976 wd 0.0500 time 0.5619 (0.5824) data time 0.0008 (0.0076) model time 0.0000 (0.0000) loss 9.2939 (8.2405) grad_norm 1.7209 (2.2186) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:07:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][60/625] eta 0:05:28 lr 0.000976 wd 0.0500 time 0.5634 (0.5815) data time 0.0006 (0.0065) model time 0.5628 (0.5756) loss 9.8449 (8.2922) grad_norm 2.7727 (2.2814) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:08:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][70/625] eta 0:05:22 lr 0.000976 wd 0.0500 time 0.5707 (0.5809) data time 0.0006 (0.0057) model time 0.5701 (0.5761) loss 7.5248 (8.2325) grad_norm 2.2013 (2.3096) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:08:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][80/625] eta 0:05:16 lr 0.000976 wd 0.0500 time 0.5691 (0.5805) data time 0.0007 (0.0051) model time 0.5684 (0.5763) loss 9.3143 (8.2215) grad_norm 1.8078 (2.2536) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:08:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][90/625] eta 0:05:10 lr 0.000976 wd 0.0500 time 0.5730 (0.5801) data time 0.0006 (0.0046) model time 0.5725 (0.5762) loss 7.0399 (8.1800) grad_norm 1.7316 (2.2250) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:08:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][100/625] eta 0:05:04 lr 0.000976 wd 0.0500 time 0.5643 (0.5798) data time 0.0006 (0.0043) model time 0.5636 (0.5763) loss 7.4826 (8.1696) grad_norm 1.5954 (2.1747) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:08:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][110/625] eta 0:04:58 lr 0.000975 wd 0.0500 time 0.5647 (0.5797) data time 0.0006 (0.0040) model time 0.5641 (0.5766) loss 8.4809 (8.1724) grad_norm 1.6850 (2.1386) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:08:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][120/625] eta 0:04:52 lr 0.000975 wd 0.0500 time 0.5650 (0.5796) data time 0.0008 (0.0037) model time 0.5642 (0.5766) loss 9.5648 (8.2038) grad_norm 1.5575 (2.1169) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:08:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][130/625] eta 0:04:46 lr 0.000975 wd 0.0500 time 0.5666 (0.5794) data time 0.0006 (0.0035) model time 0.5660 (0.5766) loss 8.7262 (8.2091) grad_norm 2.6087 (2.1572) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:08:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][140/625] eta 0:04:40 lr 0.000975 wd 0.0500 time 0.5629 (0.5792) data time 0.0007 (0.0033) model time 0.5622 (0.5765) loss 9.7810 (8.2021) grad_norm 1.7725 (2.1643) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:08:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][150/625] eta 0:04:35 lr 0.000975 wd 0.0500 time 0.5642 (0.5791) data time 0.0008 (0.0031) model time 0.5634 (0.5765) loss 6.5742 (8.1863) grad_norm 1.9688 (2.1570) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:08:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][160/625] eta 0:04:29 lr 0.000975 wd 0.0500 time 0.5630 (0.5792) data time 0.0007 (0.0030) model time 0.5622 (0.5768) loss 6.8181 (8.1552) grad_norm 2.7905 (2.1734) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:09:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][170/625] eta 0:04:23 lr 0.000975 wd 0.0500 time 0.5638 (0.5791) data time 0.0006 (0.0029) model time 0.5631 (0.5767) loss 8.4765 (8.1734) grad_norm 2.0970 (2.1710) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:09:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][180/625] eta 0:04:17 lr 0.000975 wd 0.0500 time 0.5658 (0.5793) data time 0.0009 (0.0028) model time 0.5649 (0.5772) loss 7.3774 (8.1417) grad_norm 3.6165 (2.1919) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:09:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][190/625] eta 0:04:12 lr 0.000975 wd 0.0500 time 0.7198 (0.5802) data time 0.0006 (0.0027) model time 0.7192 (0.5785) loss 7.3409 (8.1270) grad_norm 1.3970 (2.1932) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:09:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][200/625] eta 0:04:08 lr 0.000975 wd 0.0500 time 0.7410 (0.5838) data time 0.0006 (0.0026) model time 0.7404 (0.5833) loss 9.1531 (8.1100) grad_norm 1.6041 (2.1726) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:09:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][210/625] eta 0:04:03 lr 0.000975 wd 0.0500 time 0.5645 (0.5867) data time 0.0006 (0.0025) model time 0.5639 (0.5872) loss 6.7093 (8.0870) grad_norm 1.7790 (2.1713) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:09:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][220/625] eta 0:03:57 lr 0.000975 wd 0.0500 time 0.5620 (0.5867) data time 0.0007 (0.0025) model time 0.5613 (0.5870) loss 6.6532 (8.0722) grad_norm 2.0245 (2.1645) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:09:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][230/625] eta 0:03:51 lr 0.000974 wd 0.0500 time 0.5641 (0.5865) data time 0.0006 (0.0024) model time 0.5634 (0.5868) loss 7.3331 (8.0519) grad_norm 2.1045 (2.1690) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:09:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][240/625] eta 0:03:45 lr 0.000974 wd 0.0500 time 0.5626 (0.5863) data time 0.0006 (0.0023) model time 0.5620 (0.5864) loss 8.9789 (8.0494) grad_norm 1.6334 (2.1686) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:09:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][250/625] eta 0:03:39 lr 0.000974 wd 0.0500 time 0.5630 (0.5862) data time 0.0008 (0.0023) model time 0.5622 (0.5863) loss 8.8396 (8.0329) grad_norm 2.1354 (2.1658) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:09:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][260/625] eta 0:03:34 lr 0.000974 wd 0.0500 time 0.5646 (0.5864) data time 0.0008 (0.0022) model time 0.5638 (0.5864) loss 9.2799 (8.0496) grad_norm 1.9628 (2.1572) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:10:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][270/625] eta 0:03:28 lr 0.000974 wd 0.0500 time 0.5664 (0.5861) data time 0.0006 (0.0022) model time 0.5658 (0.5861) loss 7.3682 (8.0372) grad_norm 2.2842 (2.1499) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:10:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][280/625] eta 0:03:22 lr 0.000974 wd 0.0500 time 0.5676 (0.5858) data time 0.0007 (0.0021) model time 0.5669 (0.5857) loss 6.2223 (8.0319) grad_norm 1.7218 (2.1378) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:10:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][290/625] eta 0:03:16 lr 0.000974 wd 0.0500 time 0.5623 (0.5856) data time 0.0006 (0.0021) model time 0.5617 (0.5853) loss 10.6076 (8.0422) grad_norm 2.9576 (2.1334) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:10:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][300/625] eta 0:03:10 lr 0.000974 wd 0.0500 time 0.5671 (0.5853) data time 0.0007 (0.0021) model time 0.5665 (0.5849) loss 8.8988 (8.0425) grad_norm 1.7226 (2.1382) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:10:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][310/625] eta 0:03:04 lr 0.000974 wd 0.0500 time 0.5616 (0.5851) data time 0.0006 (0.0021) model time 0.5611 (0.5846) loss 9.4374 (8.0330) grad_norm 1.8523 (2.1486) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:10:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][320/625] eta 0:02:58 lr 0.000974 wd 0.0500 time 0.5697 (0.5849) data time 0.0008 (0.0020) model time 0.5689 (0.5844) loss 8.2942 (8.0340) grad_norm 1.9107 (2.1388) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:10:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][330/625] eta 0:02:52 lr 0.000974 wd 0.0500 time 0.5646 (0.5847) data time 0.0006 (0.0020) model time 0.5641 (0.5841) loss 7.0020 (8.0283) grad_norm 2.1195 (2.1249) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:10:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][340/625] eta 0:02:46 lr 0.000974 wd 0.0500 time 0.5651 (0.5845) data time 0.0006 (0.0020) model time 0.5645 (0.5839) loss 8.5746 (8.0424) grad_norm 2.1067 (2.1159) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:10:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][350/625] eta 0:02:40 lr 0.000973 wd 0.0500 time 0.5611 (0.5843) data time 0.0008 (0.0019) model time 0.5603 (0.5836) loss 6.5585 (8.0388) grad_norm 2.7057 (2.1145) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:10:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][360/625] eta 0:02:34 lr 0.000973 wd 0.0500 time 0.5635 (0.5841) data time 0.0007 (0.0019) model time 0.5628 (0.5834) loss 8.6218 (8.0213) grad_norm 1.8309 (2.1124) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:10:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][370/625] eta 0:02:28 lr 0.000973 wd 0.0500 time 0.5671 (0.5839) data time 0.0006 (0.0019) model time 0.5665 (0.5831) loss 8.6606 (8.0232) grad_norm 1.8363 (2.1125) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:11:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][380/625] eta 0:02:23 lr 0.000973 wd 0.0500 time 0.5662 (0.5837) data time 0.0006 (0.0019) model time 0.5656 (0.5829) loss 7.2402 (8.0175) grad_norm 1.7293 (2.1086) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:11:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][390/625] eta 0:02:17 lr 0.000973 wd 0.0500 time 0.5640 (0.5835) data time 0.0006 (0.0018) model time 0.5634 (0.5827) loss 7.5235 (8.0242) grad_norm 1.6420 (2.0997) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:11:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][400/625] eta 0:02:11 lr 0.000973 wd 0.0500 time 0.5679 (0.5834) data time 0.0006 (0.0018) model time 0.5673 (0.5826) loss 6.3178 (8.0231) grad_norm 2.0611 (2.0922) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:11:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][410/625] eta 0:02:05 lr 0.000973 wd 0.0500 time 0.5643 (0.5837) data time 0.0006 (0.0018) model time 0.5637 (0.5829) loss 8.4463 (8.0288) grad_norm 2.2135 (2.0924) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:11:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][420/625] eta 0:02:00 lr 0.000973 wd 0.0500 time 0.7311 (0.5854) data time 0.0007 (0.0018) model time 0.7305 (0.5849) loss 6.1537 (8.0168) grad_norm 1.8414 (2.0952) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:11:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][430/625] eta 0:01:54 lr 0.000973 wd 0.0500 time 0.5501 (0.5871) data time 0.0009 (0.0017) model time 0.5492 (0.5868) loss 7.7023 (8.0071) grad_norm 1.8807 (2.0899) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:11:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][440/625] eta 0:01:48 lr 0.000973 wd 0.0500 time 0.5659 (0.5870) data time 0.0006 (0.0017) model time 0.5654 (0.5867) loss 7.0782 (8.0092) grad_norm 1.5377 (2.0893) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:11:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][450/625] eta 0:01:42 lr 0.000973 wd 0.0500 time 0.5631 (0.5871) data time 0.0007 (0.0017) model time 0.5623 (0.5868) loss 6.8279 (8.0164) grad_norm 1.6341 (2.0892) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:11:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][460/625] eta 0:01:36 lr 0.000973 wd 0.0500 time 0.5706 (0.5870) data time 0.0006 (0.0017) model time 0.5700 (0.5866) loss 8.5377 (8.0225) grad_norm 1.7159 (2.0830) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:11:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][470/625] eta 0:01:30 lr 0.000972 wd 0.0500 time 0.5723 (0.5868) data time 0.0008 (0.0017) model time 0.5716 (0.5864) loss 8.2525 (8.0310) grad_norm 2.1439 (2.0787) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:12:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][480/625] eta 0:01:25 lr 0.000972 wd 0.0500 time 0.5630 (0.5866) data time 0.0008 (0.0017) model time 0.5622 (0.5862) loss 9.1659 (8.0345) grad_norm 1.5518 (2.0711) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:12:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][490/625] eta 0:01:19 lr 0.000972 wd 0.0500 time 0.5610 (0.5865) data time 0.0007 (0.0016) model time 0.5603 (0.5861) loss 8.2402 (8.0252) grad_norm 1.6389 (2.0677) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:12:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][500/625] eta 0:01:13 lr 0.000972 wd 0.0500 time 0.5704 (0.5864) data time 0.0006 (0.0016) model time 0.5698 (0.5859) loss 8.2059 (8.0080) grad_norm 4.0439 (2.0813) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:12:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][510/625] eta 0:01:07 lr 0.000972 wd 0.0500 time 0.5668 (0.5863) data time 0.0008 (0.0016) model time 0.5660 (0.5857) loss 8.6639 (8.0087) grad_norm 2.7161 (2.0996) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:12:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][520/625] eta 0:01:01 lr 0.000972 wd 0.0500 time 0.5650 (0.5861) data time 0.0008 (0.0016) model time 0.5641 (0.5856) loss 7.1623 (8.0051) grad_norm 1.8801 (2.0997) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:12:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][530/625] eta 0:00:55 lr 0.000972 wd 0.0500 time 0.5672 (0.5861) data time 0.0008 (0.0016) model time 0.5664 (0.5855) loss 8.7714 (8.0150) grad_norm 2.3218 (2.0988) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:12:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][540/625] eta 0:00:49 lr 0.000972 wd 0.0500 time 0.5630 (0.5860) data time 0.0006 (0.0016) model time 0.5624 (0.5854) loss 8.4886 (8.0137) grad_norm 2.0195 (2.0951) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:12:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][550/625] eta 0:00:43 lr 0.000972 wd 0.0500 time 0.5630 (0.5859) data time 0.0006 (0.0016) model time 0.5624 (0.5853) loss 8.8750 (8.0162) grad_norm 3.3976 (2.1017) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:12:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][560/625] eta 0:00:38 lr 0.000972 wd 0.0500 time 0.5671 (0.5858) data time 0.0006 (0.0016) model time 0.5665 (0.5851) loss 8.6705 (8.0210) grad_norm 2.4151 (2.1014) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:12:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][570/625] eta 0:00:32 lr 0.000972 wd 0.0500 time 0.5613 (0.5857) data time 0.0006 (0.0016) model time 0.5607 (0.5850) loss 6.6843 (8.0161) grad_norm 1.4896 (2.0979) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:13:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][580/625] eta 0:00:26 lr 0.000971 wd 0.0500 time 0.5650 (0.5856) data time 0.0009 (0.0016) model time 0.5641 (0.5849) loss 7.2574 (8.0116) grad_norm 1.4476 (2.0933) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:13:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][590/625] eta 0:00:20 lr 0.000971 wd 0.0500 time 0.5638 (0.5854) data time 0.0006 (0.0015) model time 0.5632 (0.5847) loss 7.3151 (8.0083) grad_norm 1.8339 (inf) loss_scale 2048.0000 (4071.7428) mem 22339MB +[2024-07-25 00:13:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][600/625] eta 0:00:14 lr 0.000971 wd 0.0500 time 0.5686 (0.5853) data time 0.0006 (0.0015) model time 0.5680 (0.5846) loss 6.4837 (8.0035) grad_norm 1.5822 (inf) loss_scale 2048.0000 (4038.0699) mem 22339MB +[2024-07-25 00:13:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][610/625] eta 0:00:08 lr 0.000971 wd 0.0500 time 0.5617 (0.5851) data time 0.0004 (0.0015) model time 0.5612 (0.5844) loss 8.5666 (7.9963) grad_norm 2.2936 (inf) loss_scale 2048.0000 (4005.4992) mem 22339MB +[2024-07-25 00:13:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [100/300][620/625] eta 0:00:02 lr 0.000971 wd 0.0500 time 0.5614 (0.5850) data time 0.0004 (0.0015) model time 0.5610 (0.5843) loss 7.0516 (7.9921) grad_norm 2.3198 (inf) loss_scale 2048.0000 (3973.9775) mem 22339MB +[2024-07-25 00:13:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 100 training takes 0:06:05 +[2024-07-25 00:13:28 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 00:13:29 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 00:13:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.469 (0.469) Loss 0.5327 (0.5327) Acc@1 88.672 (88.672) Acc@5 98.486 (98.486) Mem 22339MB +[2024-07-25 00:13:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.8926 (0.6762) Acc@1 78.955 (84.903) Acc@5 95.264 (97.372) Mem 22339MB +[2024-07-25 00:13:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 1.0176 (0.8054) Acc@1 75.488 (81.524) Acc@5 94.092 (96.001) Mem 22339MB +[2024-07-25 00:13:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.218 Acc@5 95.973 +[2024-07-25 00:13:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.2% +[2024-07-25 00:13:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.868 (0.868) Loss 0.5259 (0.5259) Acc@1 89.307 (89.307) Acc@5 98.535 (98.535) Mem 22339MB +[2024-07-25 00:13:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.193) Loss 0.8423 (0.6596) Acc@1 79.883 (85.760) Acc@5 96.045 (97.630) Mem 22339MB +[2024-07-25 00:13:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.161) Loss 0.9512 (0.7730) Acc@1 76.318 (82.536) Acc@5 95.020 (96.417) Mem 22339MB +[2024-07-25 00:13:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.244 Acc@5 96.429 +[2024-07-25 00:13:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.2% +[2024-07-25 00:13:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.24% +[2024-07-25 00:13:36 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 00:13:38 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 00:13:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][0/625] eta 0:10:00 lr 0.000971 wd 0.0500 time 0.9610 (0.9610) data time 0.4428 (0.4428) model time 0.0000 (0.0000) loss 6.9540 (6.9540) grad_norm 1.9010 (1.9010) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:13:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][10/625] eta 0:06:31 lr 0.000971 wd 0.0500 time 0.6915 (0.6360) data time 0.0006 (0.0410) model time 0.0000 (0.0000) loss 8.2199 (7.7285) grad_norm 1.4519 (2.0331) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:13:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][20/625] eta 0:06:31 lr 0.000971 wd 0.0500 time 0.5611 (0.6469) data time 0.0008 (0.0218) model time 0.0000 (0.0000) loss 9.1013 (7.9512) grad_norm 1.7285 (1.9976) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:13:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][30/625] eta 0:06:18 lr 0.000971 wd 0.0500 time 0.5599 (0.6356) data time 0.0006 (0.0151) model time 0.0000 (0.0000) loss 7.1495 (7.8652) grad_norm 2.1758 (2.0098) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:14:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][40/625] eta 0:06:03 lr 0.000971 wd 0.0500 time 0.5679 (0.6215) data time 0.0008 (0.0116) model time 0.0000 (0.0000) loss 7.1609 (7.8459) grad_norm 1.9507 (2.1090) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:14:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][50/625] eta 0:05:52 lr 0.000971 wd 0.0500 time 0.5643 (0.6127) data time 0.0008 (0.0095) model time 0.0000 (0.0000) loss 6.5115 (7.8478) grad_norm 1.6523 (2.0562) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:14:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][60/625] eta 0:05:42 lr 0.000971 wd 0.0500 time 0.5634 (0.6068) data time 0.0006 (0.0080) model time 0.5628 (0.5760) loss 7.9546 (7.8870) grad_norm 1.8435 (2.0235) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:14:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][70/625] eta 0:05:34 lr 0.000971 wd 0.0500 time 0.5640 (0.6026) data time 0.0008 (0.0070) model time 0.5632 (0.5759) loss 8.9711 (7.8425) grad_norm 2.0846 (2.0002) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:14:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][80/625] eta 0:05:26 lr 0.000970 wd 0.0500 time 0.5639 (0.5994) data time 0.0006 (0.0062) model time 0.5633 (0.5759) loss 7.3455 (7.8106) grad_norm 1.6786 (2.0034) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:14:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][90/625] eta 0:05:19 lr 0.000970 wd 0.0500 time 0.5621 (0.5968) data time 0.0008 (0.0057) model time 0.5613 (0.5757) loss 7.0511 (7.8675) grad_norm 1.6923 (2.0108) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:14:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][100/625] eta 0:05:12 lr 0.000970 wd 0.0500 time 0.5650 (0.5949) data time 0.0008 (0.0052) model time 0.5641 (0.5759) loss 9.0055 (7.8619) grad_norm 2.1103 (1.9960) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:14:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][110/625] eta 0:05:05 lr 0.000970 wd 0.0500 time 0.5662 (0.5933) data time 0.0008 (0.0048) model time 0.5654 (0.5759) loss 6.7708 (7.8249) grad_norm 2.5381 (2.0109) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:14:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][120/625] eta 0:04:58 lr 0.000970 wd 0.0500 time 0.5652 (0.5920) data time 0.0008 (0.0045) model time 0.5644 (0.5761) loss 5.9001 (7.7996) grad_norm 1.7539 (2.0177) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:14:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][130/625] eta 0:04:52 lr 0.000970 wd 0.0500 time 0.5700 (0.5909) data time 0.0008 (0.0042) model time 0.5692 (0.5762) loss 7.6032 (7.7601) grad_norm 2.0623 (2.0117) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:15:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][140/625] eta 0:04:46 lr 0.000970 wd 0.0500 time 0.5671 (0.5899) data time 0.0009 (0.0039) model time 0.5662 (0.5762) loss 9.1893 (7.7884) grad_norm 2.5587 (2.0343) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:15:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][150/625] eta 0:04:39 lr 0.000970 wd 0.0500 time 0.5688 (0.5890) data time 0.0008 (0.0037) model time 0.5680 (0.5761) loss 8.1901 (7.7951) grad_norm 1.9628 (2.0308) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:15:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][160/625] eta 0:04:33 lr 0.000970 wd 0.0500 time 0.5664 (0.5884) data time 0.0006 (0.0036) model time 0.5658 (0.5763) loss 7.8598 (7.8097) grad_norm 1.4159 (2.0240) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:15:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][170/625] eta 0:04:27 lr 0.000970 wd 0.0500 time 0.5681 (0.5876) data time 0.0008 (0.0034) model time 0.5674 (0.5761) loss 8.9667 (7.7926) grad_norm 1.5075 (2.0224) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:15:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][180/625] eta 0:04:21 lr 0.000970 wd 0.0500 time 0.5669 (0.5871) data time 0.0008 (0.0033) model time 0.5662 (0.5763) loss 8.6120 (7.8041) grad_norm 1.7853 (2.0258) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:15:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][190/625] eta 0:04:15 lr 0.000970 wd 0.0500 time 0.5650 (0.5867) data time 0.0006 (0.0031) model time 0.5644 (0.5763) loss 6.9201 (7.8240) grad_norm 2.2568 (2.0516) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:15:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][200/625] eta 0:04:09 lr 0.000969 wd 0.0500 time 0.5644 (0.5862) data time 0.0006 (0.0030) model time 0.5638 (0.5764) loss 6.6745 (7.7944) grad_norm 1.6320 (2.0475) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:15:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][210/625] eta 0:04:03 lr 0.000969 wd 0.0500 time 0.5609 (0.5859) data time 0.0008 (0.0029) model time 0.5601 (0.5765) loss 6.0261 (7.7810) grad_norm 1.9239 (2.0421) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:15:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][220/625] eta 0:03:57 lr 0.000969 wd 0.0500 time 0.5644 (0.5859) data time 0.0006 (0.0028) model time 0.5638 (0.5770) loss 8.4551 (7.7928) grad_norm 2.8497 (2.0460) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:15:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][230/625] eta 0:03:51 lr 0.000969 wd 0.0500 time 0.7352 (0.5867) data time 0.0007 (0.0027) model time 0.7345 (0.5785) loss 7.2658 (7.8059) grad_norm 1.9307 (2.0540) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:16:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][240/625] eta 0:03:47 lr 0.000969 wd 0.0500 time 0.7471 (0.5906) data time 0.0009 (0.0027) model time 0.7462 (0.5839) loss 7.5414 (7.7938) grad_norm 1.8497 (2.0535) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:16:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][250/625] eta 0:03:42 lr 0.000969 wd 0.0500 time 0.5668 (0.5921) data time 0.0008 (0.0026) model time 0.5660 (0.5860) loss 8.3280 (7.7901) grad_norm 1.8082 (2.0491) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:16:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][260/625] eta 0:03:35 lr 0.000969 wd 0.0500 time 0.5692 (0.5915) data time 0.0007 (0.0025) model time 0.5685 (0.5855) loss 6.8205 (7.7963) grad_norm 3.3925 (2.0537) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:16:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][270/625] eta 0:03:29 lr 0.000969 wd 0.0500 time 0.5634 (0.5911) data time 0.0006 (0.0025) model time 0.5627 (0.5852) loss 7.9549 (7.7953) grad_norm 2.1300 (2.0670) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:16:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][280/625] eta 0:03:23 lr 0.000969 wd 0.0500 time 0.5676 (0.5906) data time 0.0007 (0.0024) model time 0.5670 (0.5848) loss 9.4091 (7.8254) grad_norm 1.6105 (2.0632) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:16:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][290/625] eta 0:03:17 lr 0.000969 wd 0.0500 time 0.5636 (0.5901) data time 0.0008 (0.0024) model time 0.5628 (0.5845) loss 7.4370 (7.8425) grad_norm 2.0639 (2.0641) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:16:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][300/625] eta 0:03:11 lr 0.000969 wd 0.0500 time 0.5650 (0.5897) data time 0.0006 (0.0023) model time 0.5644 (0.5841) loss 8.4286 (7.8539) grad_norm 1.4985 (2.0558) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:16:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][310/625] eta 0:03:05 lr 0.000969 wd 0.0500 time 0.5624 (0.5893) data time 0.0006 (0.0023) model time 0.5618 (0.5839) loss 7.8227 (7.8424) grad_norm 2.4532 (2.0590) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:16:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][320/625] eta 0:02:59 lr 0.000968 wd 0.0500 time 0.5640 (0.5889) data time 0.0006 (0.0022) model time 0.5634 (0.5835) loss 8.3000 (7.8419) grad_norm 2.0286 (2.0682) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:16:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][330/625] eta 0:02:53 lr 0.000968 wd 0.0500 time 0.5707 (0.5886) data time 0.0006 (0.0022) model time 0.5701 (0.5833) loss 7.1034 (7.8446) grad_norm 1.8932 (2.0737) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:16:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][340/625] eta 0:02:47 lr 0.000968 wd 0.0500 time 0.5658 (0.5883) data time 0.0006 (0.0021) model time 0.5651 (0.5831) loss 8.9970 (7.8470) grad_norm 2.3219 (2.0803) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:17:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][350/625] eta 0:02:41 lr 0.000968 wd 0.0500 time 0.5637 (0.5880) data time 0.0006 (0.0021) model time 0.5631 (0.5830) loss 7.4919 (7.8468) grad_norm 2.3902 (2.0787) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:17:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][360/625] eta 0:02:35 lr 0.000968 wd 0.0500 time 0.5707 (0.5877) data time 0.0006 (0.0021) model time 0.5701 (0.5828) loss 7.0705 (7.8409) grad_norm 2.4583 (2.0876) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:17:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][370/625] eta 0:02:29 lr 0.000968 wd 0.0500 time 0.5630 (0.5874) data time 0.0006 (0.0020) model time 0.5624 (0.5825) loss 7.3135 (7.8535) grad_norm 2.1706 (2.0823) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:17:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][380/625] eta 0:02:23 lr 0.000968 wd 0.0500 time 0.5650 (0.5871) data time 0.0008 (0.0020) model time 0.5642 (0.5823) loss 8.2172 (7.8548) grad_norm 1.5067 (2.0802) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:17:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][390/625] eta 0:02:17 lr 0.000968 wd 0.0500 time 0.5774 (0.5869) data time 0.0008 (0.0020) model time 0.5766 (0.5822) loss 8.8487 (7.8764) grad_norm 2.1491 (2.0805) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:17:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][400/625] eta 0:02:12 lr 0.000968 wd 0.0500 time 0.5655 (0.5867) data time 0.0009 (0.0019) model time 0.5646 (0.5820) loss 6.9151 (7.8730) grad_norm 2.6279 (2.0994) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:17:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][410/625] eta 0:02:06 lr 0.000968 wd 0.0500 time 0.5650 (0.5864) data time 0.0009 (0.0019) model time 0.5642 (0.5819) loss 9.2113 (7.8633) grad_norm 2.4748 (2.0961) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:17:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][420/625] eta 0:02:00 lr 0.000968 wd 0.0500 time 0.5626 (0.5862) data time 0.0006 (0.0019) model time 0.5620 (0.5817) loss 7.6253 (7.8685) grad_norm 1.8154 (2.0940) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:17:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][430/625] eta 0:01:54 lr 0.000967 wd 0.0500 time 0.5651 (0.5860) data time 0.0008 (0.0019) model time 0.5644 (0.5816) loss 6.7371 (7.8723) grad_norm 1.9511 (2.0978) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:17:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][440/625] eta 0:01:48 lr 0.000967 wd 0.0500 time 0.5199 (0.5862) data time 0.0009 (0.0018) model time 0.5190 (0.5819) loss 6.9543 (7.8718) grad_norm 2.0310 (2.0984) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:18:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][450/625] eta 0:01:42 lr 0.000967 wd 0.0500 time 0.7344 (0.5869) data time 0.0006 (0.0018) model time 0.7337 (0.5828) loss 6.9211 (7.8704) grad_norm 2.7760 (2.0978) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:18:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][460/625] eta 0:01:37 lr 0.000967 wd 0.0500 time 0.7195 (0.5883) data time 0.0008 (0.0018) model time 0.7188 (0.5844) loss 8.1926 (7.8664) grad_norm 2.0100 (2.1007) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:18:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][470/625] eta 0:01:31 lr 0.000967 wd 0.0500 time 0.5694 (0.5888) data time 0.0006 (0.0018) model time 0.5688 (0.5850) loss 7.9310 (7.8781) grad_norm 1.6644 (2.0999) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:18:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][480/625] eta 0:01:25 lr 0.000967 wd 0.0500 time 0.5693 (0.5886) data time 0.0008 (0.0018) model time 0.5685 (0.5849) loss 9.3879 (7.8797) grad_norm 2.1271 (2.0980) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:18:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][490/625] eta 0:01:19 lr 0.000967 wd 0.0500 time 0.5639 (0.5883) data time 0.0006 (0.0017) model time 0.5633 (0.5847) loss 7.7634 (7.8769) grad_norm 3.1110 (2.0999) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:18:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][500/625] eta 0:01:13 lr 0.000967 wd 0.0500 time 0.5638 (0.5881) data time 0.0006 (0.0017) model time 0.5632 (0.5845) loss 8.2188 (7.8767) grad_norm 1.7059 (2.1000) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:18:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][510/625] eta 0:01:07 lr 0.000967 wd 0.0500 time 0.5651 (0.5879) data time 0.0006 (0.0017) model time 0.5645 (0.5843) loss 7.2356 (7.8755) grad_norm 1.8591 (2.1078) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:18:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][520/625] eta 0:01:01 lr 0.000967 wd 0.0500 time 0.5622 (0.5877) data time 0.0006 (0.0017) model time 0.5615 (0.5841) loss 5.8985 (7.8784) grad_norm 1.6689 (2.1070) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:18:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][530/625] eta 0:00:55 lr 0.000967 wd 0.0500 time 0.5622 (0.5875) data time 0.0009 (0.0017) model time 0.5614 (0.5840) loss 7.0255 (7.8706) grad_norm 2.3855 (2.1086) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:18:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][540/625] eta 0:00:49 lr 0.000967 wd 0.0500 time 0.5645 (0.5873) data time 0.0008 (0.0017) model time 0.5636 (0.5838) loss 8.7117 (7.8715) grad_norm 1.8141 (2.1088) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:19:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][550/625] eta 0:00:44 lr 0.000966 wd 0.0500 time 0.5642 (0.5872) data time 0.0007 (0.0016) model time 0.5635 (0.5837) loss 9.1722 (7.8784) grad_norm 1.5209 (2.1047) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:19:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][560/625] eta 0:00:38 lr 0.000966 wd 0.0500 time 0.5628 (0.5870) data time 0.0006 (0.0016) model time 0.5621 (0.5836) loss 7.4672 (7.8777) grad_norm 2.0006 (2.1013) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:19:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][570/625] eta 0:00:32 lr 0.000966 wd 0.0500 time 0.5671 (0.5869) data time 0.0006 (0.0016) model time 0.5664 (0.5835) loss 7.1039 (7.8748) grad_norm 1.8131 (2.0985) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:19:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][580/625] eta 0:00:26 lr 0.000966 wd 0.0500 time 0.5659 (0.5868) data time 0.0008 (0.0016) model time 0.5651 (0.5835) loss 9.2120 (7.8782) grad_norm 1.7975 (2.0960) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:19:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][590/625] eta 0:00:20 lr 0.000966 wd 0.0500 time 0.5635 (0.5867) data time 0.0008 (0.0016) model time 0.5627 (0.5834) loss 7.0026 (7.8823) grad_norm 1.7089 (2.0922) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:19:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][600/625] eta 0:00:14 lr 0.000966 wd 0.0500 time 0.5689 (0.5866) data time 0.0006 (0.0016) model time 0.5682 (0.5833) loss 7.7660 (7.8814) grad_norm 2.0964 (2.0903) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:19:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][610/625] eta 0:00:08 lr 0.000966 wd 0.0500 time 0.5712 (0.5864) data time 0.0004 (0.0016) model time 0.5708 (0.5831) loss 7.8022 (7.8777) grad_norm 1.5647 (2.0888) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:19:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [101/300][620/625] eta 0:00:02 lr 0.000966 wd 0.0500 time 0.5631 (0.5863) data time 0.0004 (0.0016) model time 0.5627 (0.5831) loss 7.6296 (7.8764) grad_norm 1.9302 (2.0891) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:19:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 101 training takes 0:06:06 +[2024-07-25 00:19:45 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 00:19:46 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 00:19:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.465 (0.465) Loss 0.5244 (0.5244) Acc@1 89.209 (89.209) Acc@5 98.389 (98.389) Mem 22339MB +[2024-07-25 00:19:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.157) Loss 0.8911 (0.6728) Acc@1 78.564 (85.019) Acc@5 95.361 (97.394) Mem 22339MB +[2024-07-25 00:19:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.9917 (0.8078) Acc@1 75.830 (81.545) Acc@5 93.896 (95.857) Mem 22339MB +[2024-07-25 00:19:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.308 Acc@5 95.877 +[2024-07-25 00:19:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.3% +[2024-07-25 00:19:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.839 (0.839) Loss 0.5239 (0.5239) Acc@1 89.404 (89.404) Acc@5 98.486 (98.486) Mem 22339MB +[2024-07-25 00:19:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.191) Loss 0.8403 (0.6578) Acc@1 79.980 (85.804) Acc@5 96.045 (97.625) Mem 22339MB +[2024-07-25 00:19:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.160) Loss 0.9497 (0.7710) Acc@1 76.270 (82.564) Acc@5 94.971 (96.426) Mem 22339MB +[2024-07-25 00:19:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.266 Acc@5 96.439 +[2024-07-25 00:19:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.3% +[2024-07-25 00:19:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.27% +[2024-07-25 00:19:53 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 00:19:55 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 00:19:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][0/625] eta 0:09:44 lr 0.000966 wd 0.0500 time 0.9358 (0.9358) data time 0.4142 (0.4142) model time 0.0000 (0.0000) loss 7.0154 (7.0154) grad_norm 1.7986 (1.7986) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:20:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][10/625] eta 0:06:14 lr 0.000966 wd 0.0500 time 0.5662 (0.6087) data time 0.0008 (0.0384) model time 0.0000 (0.0000) loss 9.6007 (7.7511) grad_norm 1.6320 (2.0622) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:20:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][20/625] eta 0:05:59 lr 0.000966 wd 0.0500 time 0.5673 (0.5941) data time 0.0008 (0.0205) model time 0.0000 (0.0000) loss 7.1710 (7.7917) grad_norm 1.8217 (2.0610) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:20:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][30/625] eta 0:05:49 lr 0.000966 wd 0.0500 time 0.5652 (0.5877) data time 0.0006 (0.0141) model time 0.0000 (0.0000) loss 8.3548 (7.8611) grad_norm 2.0508 (2.0223) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:20:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][40/625] eta 0:05:42 lr 0.000965 wd 0.0500 time 0.5646 (0.5853) data time 0.0006 (0.0109) model time 0.0000 (0.0000) loss 8.9266 (7.8447) grad_norm 2.8596 (2.0958) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:20:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][50/625] eta 0:05:42 lr 0.000965 wd 0.0500 time 0.7270 (0.5955) data time 0.0008 (0.0089) model time 0.0000 (0.0000) loss 8.3798 (7.9072) grad_norm 3.1012 (2.1721) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:20:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][60/625] eta 0:05:44 lr 0.000965 wd 0.0500 time 0.7005 (0.6095) data time 0.0006 (0.0076) model time 0.6999 (0.6802) loss 7.2510 (7.8184) grad_norm 1.8420 (2.1188) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:20:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][70/625] eta 0:05:36 lr 0.000965 wd 0.0500 time 0.5627 (0.6063) data time 0.0007 (0.0066) model time 0.5620 (0.6330) loss 5.7458 (7.7218) grad_norm 2.3276 (2.0715) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:20:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][80/625] eta 0:05:28 lr 0.000965 wd 0.0500 time 0.5679 (0.6026) data time 0.0006 (0.0059) model time 0.5673 (0.6140) loss 7.5018 (7.7706) grad_norm 1.7224 (2.0616) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:20:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][90/625] eta 0:05:20 lr 0.000965 wd 0.0500 time 0.5653 (0.5998) data time 0.0008 (0.0054) model time 0.5645 (0.6044) loss 7.6174 (7.7587) grad_norm 2.3563 (2.0415) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:20:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][100/625] eta 0:05:13 lr 0.000965 wd 0.0500 time 0.5689 (0.5976) data time 0.0008 (0.0049) model time 0.5681 (0.5990) loss 9.0631 (7.7682) grad_norm 2.6155 (2.0509) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:21:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][110/625] eta 0:05:06 lr 0.000965 wd 0.0500 time 0.5664 (0.5958) data time 0.0008 (0.0045) model time 0.5656 (0.5953) loss 8.1506 (7.7848) grad_norm 1.4404 (2.0436) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:21:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][120/625] eta 0:05:00 lr 0.000965 wd 0.0500 time 0.5638 (0.5944) data time 0.0008 (0.0042) model time 0.5630 (0.5927) loss 9.0335 (7.7806) grad_norm 2.0100 (2.0263) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:21:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][130/625] eta 0:04:53 lr 0.000965 wd 0.0500 time 0.5663 (0.5931) data time 0.0007 (0.0040) model time 0.5656 (0.5907) loss 7.2975 (7.7950) grad_norm 3.1533 (2.0402) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:21:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][140/625] eta 0:04:47 lr 0.000965 wd 0.0500 time 0.5627 (0.5920) data time 0.0009 (0.0038) model time 0.5618 (0.5892) loss 8.1276 (7.7956) grad_norm 1.4968 (2.0207) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:21:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][150/625] eta 0:04:40 lr 0.000965 wd 0.0500 time 0.5638 (0.5910) data time 0.0006 (0.0036) model time 0.5633 (0.5878) loss 7.1293 (7.8038) grad_norm 2.1718 (2.0215) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:21:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][160/625] eta 0:04:34 lr 0.000964 wd 0.0500 time 0.5728 (0.5902) data time 0.0008 (0.0034) model time 0.5721 (0.5869) loss 7.4866 (7.8114) grad_norm 4.0933 (2.0322) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:21:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][170/625] eta 0:04:28 lr 0.000964 wd 0.0500 time 0.5612 (0.5894) data time 0.0007 (0.0033) model time 0.5605 (0.5859) loss 6.3954 (7.8188) grad_norm 2.0969 (2.0587) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:21:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][180/625] eta 0:04:21 lr 0.000964 wd 0.0500 time 0.5634 (0.5887) data time 0.0008 (0.0031) model time 0.5627 (0.5852) loss 8.2723 (7.8217) grad_norm 1.5463 (2.0727) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:21:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][190/625] eta 0:04:15 lr 0.000964 wd 0.0500 time 0.5663 (0.5881) data time 0.0007 (0.0030) model time 0.5656 (0.5845) loss 8.8829 (7.8372) grad_norm 1.5360 (2.0566) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:21:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][200/625] eta 0:04:09 lr 0.000964 wd 0.0500 time 0.5644 (0.5878) data time 0.0008 (0.0029) model time 0.5636 (0.5843) loss 8.6666 (7.8567) grad_norm 2.1606 (2.0624) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:21:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][210/625] eta 0:04:03 lr 0.000964 wd 0.0500 time 0.5634 (0.5873) data time 0.0008 (0.0028) model time 0.5627 (0.5839) loss 7.7285 (7.8721) grad_norm 2.3111 (2.0656) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:22:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][220/625] eta 0:03:57 lr 0.000964 wd 0.0500 time 0.5659 (0.5869) data time 0.0008 (0.0027) model time 0.5651 (0.5835) loss 6.7109 (7.8390) grad_norm 1.9612 (2.0782) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:22:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][230/625] eta 0:03:51 lr 0.000964 wd 0.0500 time 0.5621 (0.5865) data time 0.0009 (0.0026) model time 0.5612 (0.5831) loss 8.5011 (7.8514) grad_norm 2.4200 (2.0802) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:22:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][240/625] eta 0:03:45 lr 0.000964 wd 0.0500 time 0.5622 (0.5862) data time 0.0009 (0.0026) model time 0.5613 (0.5829) loss 7.1950 (7.8379) grad_norm 3.0026 (2.0852) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:22:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][250/625] eta 0:03:39 lr 0.000964 wd 0.0500 time 0.5644 (0.5859) data time 0.0008 (0.0025) model time 0.5636 (0.5826) loss 7.5324 (7.8321) grad_norm 1.8993 (2.0857) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:22:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][260/625] eta 0:03:33 lr 0.000964 wd 0.0500 time 0.5657 (0.5856) data time 0.0007 (0.0024) model time 0.5650 (0.5824) loss 8.5834 (7.8488) grad_norm 2.6181 (2.0825) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:22:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][270/625] eta 0:03:28 lr 0.000964 wd 0.0500 time 0.7572 (0.5882) data time 0.0006 (0.0024) model time 0.7566 (0.5856) loss 8.0242 (7.8445) grad_norm 3.1161 (2.1046) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:22:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][280/625] eta 0:03:23 lr 0.000963 wd 0.0500 time 0.5619 (0.5902) data time 0.0008 (0.0023) model time 0.5611 (0.5881) loss 7.7494 (7.8288) grad_norm 1.5281 (2.1155) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:22:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][290/625] eta 0:03:17 lr 0.000963 wd 0.0500 time 0.5646 (0.5907) data time 0.0006 (0.0023) model time 0.5640 (0.5889) loss 6.5724 (7.8335) grad_norm 2.4659 (2.1106) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:22:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][300/625] eta 0:03:11 lr 0.000963 wd 0.0500 time 0.5697 (0.5905) data time 0.0006 (0.0022) model time 0.5691 (0.5886) loss 8.6862 (7.8221) grad_norm 2.2601 (2.1036) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:22:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][310/625] eta 0:03:05 lr 0.000963 wd 0.0500 time 0.5661 (0.5902) data time 0.0008 (0.0022) model time 0.5653 (0.5883) loss 8.1681 (7.8229) grad_norm 2.1807 (2.1034) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:23:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][320/625] eta 0:02:59 lr 0.000963 wd 0.0500 time 0.5669 (0.5899) data time 0.0008 (0.0021) model time 0.5661 (0.5879) loss 8.7609 (7.8342) grad_norm 3.0751 (2.1164) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:23:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][330/625] eta 0:02:53 lr 0.000963 wd 0.0500 time 0.5671 (0.5895) data time 0.0006 (0.0021) model time 0.5664 (0.5876) loss 7.2357 (7.8301) grad_norm 2.8282 (2.1170) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:23:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][340/625] eta 0:02:47 lr 0.000963 wd 0.0500 time 0.5624 (0.5893) data time 0.0006 (0.0021) model time 0.5618 (0.5874) loss 8.6850 (7.8245) grad_norm 2.0588 (2.1198) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:23:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][350/625] eta 0:02:42 lr 0.000963 wd 0.0500 time 0.5683 (0.5891) data time 0.0008 (0.0020) model time 0.5675 (0.5872) loss 5.8426 (7.8331) grad_norm 2.1532 (2.1143) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:23:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][360/625] eta 0:02:36 lr 0.000963 wd 0.0500 time 0.5618 (0.5890) data time 0.0006 (0.0020) model time 0.5612 (0.5871) loss 7.7447 (7.8188) grad_norm 1.5053 (2.1099) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:23:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][370/625] eta 0:02:30 lr 0.000963 wd 0.0500 time 0.5707 (0.5887) data time 0.0008 (0.0020) model time 0.5699 (0.5868) loss 8.3121 (7.8331) grad_norm 2.0884 (2.1111) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:23:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][380/625] eta 0:02:24 lr 0.000963 wd 0.0500 time 0.5654 (0.5884) data time 0.0006 (0.0019) model time 0.5647 (0.5865) loss 8.5566 (7.8340) grad_norm 1.6728 (2.1184) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:23:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][390/625] eta 0:02:18 lr 0.000963 wd 0.0500 time 0.5625 (0.5882) data time 0.0008 (0.0019) model time 0.5617 (0.5862) loss 7.9531 (7.8423) grad_norm 2.4352 (2.1180) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:23:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][400/625] eta 0:02:12 lr 0.000962 wd 0.0500 time 0.5637 (0.5880) data time 0.0008 (0.0019) model time 0.5629 (0.5861) loss 9.6372 (7.8494) grad_norm 1.5145 (2.1183) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:23:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][410/625] eta 0:02:06 lr 0.000962 wd 0.0500 time 0.5769 (0.5878) data time 0.0009 (0.0019) model time 0.5760 (0.5859) loss 7.9506 (7.8581) grad_norm 2.9366 (2.1156) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:24:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][420/625] eta 0:02:00 lr 0.000962 wd 0.0500 time 0.5653 (0.5880) data time 0.0008 (0.0018) model time 0.5646 (0.5861) loss 7.3427 (7.8529) grad_norm 2.0498 (2.1174) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:24:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][430/625] eta 0:01:54 lr 0.000962 wd 0.0500 time 0.5712 (0.5877) data time 0.0007 (0.0018) model time 0.5705 (0.5859) loss 7.0502 (7.8529) grad_norm 1.7737 (2.1165) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:24:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][440/625] eta 0:01:48 lr 0.000962 wd 0.0500 time 0.5643 (0.5876) data time 0.0006 (0.0018) model time 0.5637 (0.5857) loss 9.0946 (7.8586) grad_norm 1.8259 (2.1158) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:24:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][450/625] eta 0:01:42 lr 0.000962 wd 0.0500 time 0.5631 (0.5875) data time 0.0008 (0.0018) model time 0.5622 (0.5856) loss 6.8713 (7.8564) grad_norm 1.5801 (2.1097) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:24:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][460/625] eta 0:01:36 lr 0.000962 wd 0.0500 time 0.5614 (0.5872) data time 0.0008 (0.0017) model time 0.5606 (0.5854) loss 8.0059 (7.8482) grad_norm 1.9436 (2.1110) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:24:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][470/625] eta 0:01:30 lr 0.000962 wd 0.0500 time 0.5647 (0.5870) data time 0.0006 (0.0017) model time 0.5641 (0.5852) loss 6.4598 (7.8392) grad_norm 2.2739 (2.1093) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:24:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][480/625] eta 0:01:25 lr 0.000962 wd 0.0500 time 0.5662 (0.5868) data time 0.0008 (0.0017) model time 0.5654 (0.5850) loss 9.8689 (7.8464) grad_norm 1.7977 (2.1120) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:24:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][490/625] eta 0:01:19 lr 0.000962 wd 0.0500 time 0.6980 (0.5876) data time 0.0006 (0.0017) model time 0.6974 (0.5858) loss 5.6223 (7.8475) grad_norm 1.9855 (2.1106) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:24:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][500/625] eta 0:01:13 lr 0.000962 wd 0.0500 time 0.5643 (0.5887) data time 0.0006 (0.0017) model time 0.5637 (0.5871) loss 8.5964 (7.8505) grad_norm 1.8548 (2.1169) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:24:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][510/625] eta 0:01:07 lr 0.000961 wd 0.0500 time 0.5669 (0.5890) data time 0.0008 (0.0017) model time 0.5662 (0.5874) loss 8.0492 (7.8426) grad_norm 1.9897 (2.1155) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:25:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][520/625] eta 0:01:01 lr 0.000961 wd 0.0500 time 0.5695 (0.5888) data time 0.0006 (0.0016) model time 0.5689 (0.5872) loss 7.3494 (7.8457) grad_norm 1.7560 (2.1166) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:25:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][530/625] eta 0:00:55 lr 0.000961 wd 0.0500 time 0.5684 (0.5886) data time 0.0008 (0.0016) model time 0.5676 (0.5871) loss 6.6766 (7.8321) grad_norm 1.7337 (2.1323) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:25:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][540/625] eta 0:00:50 lr 0.000961 wd 0.0500 time 0.5690 (0.5884) data time 0.0009 (0.0016) model time 0.5680 (0.5868) loss 8.1280 (7.8357) grad_norm 1.4430 (2.1281) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:25:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][550/625] eta 0:00:44 lr 0.000961 wd 0.0500 time 0.5753 (0.5882) data time 0.0006 (0.0016) model time 0.5747 (0.5866) loss 7.1444 (7.8299) grad_norm 2.3857 (2.1266) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:25:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][560/625] eta 0:00:38 lr 0.000961 wd 0.0500 time 0.5631 (0.5880) data time 0.0006 (0.0016) model time 0.5625 (0.5864) loss 8.1009 (7.8281) grad_norm 2.2318 (2.1303) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:25:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][570/625] eta 0:00:32 lr 0.000961 wd 0.0500 time 0.5643 (0.5878) data time 0.0006 (0.0016) model time 0.5637 (0.5862) loss 8.4735 (7.8350) grad_norm 1.8703 (2.1270) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:25:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][580/625] eta 0:00:26 lr 0.000961 wd 0.0500 time 0.5699 (0.5877) data time 0.0009 (0.0016) model time 0.5691 (0.5861) loss 7.9934 (7.8388) grad_norm 3.2986 (2.1328) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:25:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][590/625] eta 0:00:20 lr 0.000961 wd 0.0500 time 0.5628 (0.5875) data time 0.0006 (0.0016) model time 0.5622 (0.5859) loss 8.4279 (7.8425) grad_norm 2.4566 (2.1325) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:25:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][600/625] eta 0:00:14 lr 0.000961 wd 0.0500 time 0.5634 (0.5874) data time 0.0008 (0.0015) model time 0.5626 (0.5858) loss 8.6955 (7.8440) grad_norm 2.0299 (2.1324) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:25:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][610/625] eta 0:00:08 lr 0.000961 wd 0.0500 time 0.5656 (0.5872) data time 0.0004 (0.0015) model time 0.5652 (0.5856) loss 6.9994 (7.8469) grad_norm 1.5111 (2.1292) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:26:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [102/300][620/625] eta 0:00:02 lr 0.000961 wd 0.0500 time 0.5654 (0.5872) data time 0.0006 (0.0015) model time 0.5648 (0.5856) loss 8.2865 (7.8468) grad_norm 2.1864 (2.1284) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:26:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 102 training takes 0:06:06 +[2024-07-25 00:26:02 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 00:26:04 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 00:26:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.484 (0.484) Loss 0.5498 (0.5498) Acc@1 88.184 (88.184) Acc@5 98.584 (98.584) Mem 22339MB +[2024-07-25 00:26:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.9058 (0.6901) Acc@1 78.467 (84.974) Acc@5 95.361 (97.377) Mem 22339MB +[2024-07-25 00:26:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 1.0381 (0.8175) Acc@1 74.805 (81.559) Acc@5 93.994 (95.936) Mem 22339MB +[2024-07-25 00:26:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.270 Acc@5 95.943 +[2024-07-25 00:26:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.3% +[2024-07-25 00:26:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.788 (0.788) Loss 0.5220 (0.5220) Acc@1 89.404 (89.404) Acc@5 98.486 (98.486) Mem 22339MB +[2024-07-25 00:26:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.186) Loss 0.8384 (0.6561) Acc@1 80.127 (85.853) Acc@5 96.143 (97.638) Mem 22339MB +[2024-07-25 00:26:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.157) Loss 0.9473 (0.7691) Acc@1 76.318 (82.622) Acc@5 94.873 (96.431) Mem 22339MB +[2024-07-25 00:26:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.312 Acc@5 96.445 +[2024-07-25 00:26:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.3% +[2024-07-25 00:26:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.31% +[2024-07-25 00:26:11 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 00:26:12 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 00:26:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][0/625] eta 0:09:37 lr 0.000961 wd 0.0500 time 0.9235 (0.9235) data time 0.4059 (0.4059) model time 0.0000 (0.0000) loss 9.6111 (9.6111) grad_norm 1.8687 (1.8687) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:26:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][10/625] eta 0:06:15 lr 0.000960 wd 0.0500 time 0.5601 (0.6107) data time 0.0006 (0.0377) model time 0.0000 (0.0000) loss 6.9855 (7.8325) grad_norm 1.8730 (1.8446) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:26:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][20/625] eta 0:05:59 lr 0.000960 wd 0.0500 time 0.5645 (0.5947) data time 0.0007 (0.0201) model time 0.0000 (0.0000) loss 9.0353 (7.9355) grad_norm 1.7948 (1.8463) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:26:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][30/625] eta 0:05:50 lr 0.000960 wd 0.0500 time 0.5677 (0.5891) data time 0.0005 (0.0139) model time 0.0000 (0.0000) loss 8.9684 (7.9143) grad_norm 2.9472 (1.8692) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:26:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][40/625] eta 0:05:43 lr 0.000960 wd 0.0500 time 0.5630 (0.5864) data time 0.0006 (0.0107) model time 0.0000 (0.0000) loss 7.1354 (7.8587) grad_norm 1.8233 (2.0037) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:26:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][50/625] eta 0:05:36 lr 0.000960 wd 0.0500 time 0.5666 (0.5849) data time 0.0006 (0.0087) model time 0.0000 (0.0000) loss 7.3984 (7.8752) grad_norm 2.4621 (2.0480) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:26:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][60/625] eta 0:05:30 lr 0.000960 wd 0.0500 time 0.5605 (0.5844) data time 0.0008 (0.0074) model time 0.5598 (0.5812) loss 8.2490 (7.8826) grad_norm 2.6158 (2.0347) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:26:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][70/625] eta 0:05:24 lr 0.000960 wd 0.0500 time 0.5633 (0.5840) data time 0.0006 (0.0067) model time 0.5627 (0.5805) loss 7.2330 (7.8583) grad_norm 2.1324 (2.0988) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:27:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][80/625] eta 0:05:18 lr 0.000960 wd 0.0500 time 0.5629 (0.5845) data time 0.0006 (0.0059) model time 0.5624 (0.5828) loss 6.9833 (7.8755) grad_norm 1.6600 (2.1381) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:27:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][90/625] eta 0:05:15 lr 0.000960 wd 0.0500 time 0.7131 (0.5900) data time 0.0006 (0.0054) model time 0.7126 (0.5956) loss 7.2606 (7.8423) grad_norm 2.1592 (2.1375) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:27:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][100/625] eta 0:05:11 lr 0.000960 wd 0.0500 time 0.7366 (0.5936) data time 0.0006 (0.0050) model time 0.7360 (0.6013) loss 7.0442 (7.8285) grad_norm 2.2884 (2.1532) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:27:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][110/625] eta 0:05:05 lr 0.000960 wd 0.0500 time 0.5609 (0.5939) data time 0.0008 (0.0046) model time 0.5601 (0.6005) loss 7.8135 (7.8109) grad_norm 2.1250 (2.1218) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:27:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][120/625] eta 0:04:59 lr 0.000959 wd 0.0500 time 0.5607 (0.5932) data time 0.0009 (0.0043) model time 0.5599 (0.5982) loss 8.4046 (7.8133) grad_norm 2.3655 (2.1002) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:27:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][130/625] eta 0:04:53 lr 0.000959 wd 0.0500 time 0.5615 (0.5921) data time 0.0006 (0.0040) model time 0.5609 (0.5956) loss 8.4159 (7.8230) grad_norm 1.4433 (2.0982) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:27:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][140/625] eta 0:04:46 lr 0.000959 wd 0.0500 time 0.5642 (0.5911) data time 0.0008 (0.0038) model time 0.5634 (0.5937) loss 9.3786 (7.8630) grad_norm 2.3804 (2.1251) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:27:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][150/625] eta 0:04:40 lr 0.000959 wd 0.0500 time 0.5673 (0.5905) data time 0.0009 (0.0036) model time 0.5664 (0.5923) loss 8.3962 (7.8727) grad_norm 1.8246 (2.1239) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:27:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][160/625] eta 0:04:34 lr 0.000959 wd 0.0500 time 0.5643 (0.5897) data time 0.0006 (0.0034) model time 0.5637 (0.5909) loss 7.7942 (7.8929) grad_norm 2.4770 (2.1309) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:27:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][170/625] eta 0:04:28 lr 0.000959 wd 0.0500 time 0.5683 (0.5890) data time 0.0006 (0.0033) model time 0.5677 (0.5898) loss 7.8835 (7.8977) grad_norm 1.9998 (2.1511) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:27:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][180/625] eta 0:04:22 lr 0.000959 wd 0.0500 time 0.5651 (0.5891) data time 0.0008 (0.0031) model time 0.5643 (0.5898) loss 6.6178 (7.8823) grad_norm 3.0951 (2.1560) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:28:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][190/625] eta 0:04:15 lr 0.000959 wd 0.0500 time 0.5666 (0.5885) data time 0.0008 (0.0030) model time 0.5658 (0.5888) loss 6.6714 (7.8730) grad_norm 2.5104 (2.1766) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:28:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][200/625] eta 0:04:09 lr 0.000959 wd 0.0500 time 0.5682 (0.5879) data time 0.0006 (0.0029) model time 0.5676 (0.5880) loss 8.6465 (7.9000) grad_norm 2.5360 (2.1847) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:28:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][210/625] eta 0:04:03 lr 0.000959 wd 0.0500 time 0.5627 (0.5874) data time 0.0006 (0.0028) model time 0.5622 (0.5873) loss 7.7421 (7.8953) grad_norm 2.8122 (2.2062) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:28:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][220/625] eta 0:03:57 lr 0.000959 wd 0.0500 time 0.5684 (0.5870) data time 0.0006 (0.0027) model time 0.5678 (0.5867) loss 6.0218 (7.8818) grad_norm 2.1256 (2.1919) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:28:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][230/625] eta 0:03:51 lr 0.000959 wd 0.0500 time 0.5678 (0.5866) data time 0.0007 (0.0026) model time 0.5671 (0.5861) loss 8.3699 (7.8985) grad_norm 1.6427 (2.1738) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:28:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][240/625] eta 0:03:45 lr 0.000958 wd 0.0500 time 0.5659 (0.5862) data time 0.0008 (0.0026) model time 0.5652 (0.5856) loss 9.4687 (7.9148) grad_norm 2.0377 (2.1612) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:28:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][250/625] eta 0:03:39 lr 0.000958 wd 0.0500 time 0.5720 (0.5858) data time 0.0006 (0.0025) model time 0.5714 (0.5852) loss 6.2466 (7.9039) grad_norm 2.5279 (2.1555) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:28:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][260/625] eta 0:03:33 lr 0.000958 wd 0.0500 time 0.5695 (0.5856) data time 0.0006 (0.0024) model time 0.5689 (0.5848) loss 9.1035 (7.9086) grad_norm 3.5424 (2.1778) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:28:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][270/625] eta 0:03:27 lr 0.000958 wd 0.0500 time 0.5670 (0.5852) data time 0.0006 (0.0024) model time 0.5664 (0.5844) loss 8.4520 (7.9122) grad_norm 1.8819 (2.1831) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:28:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][280/625] eta 0:03:21 lr 0.000958 wd 0.0500 time 0.5649 (0.5849) data time 0.0006 (0.0023) model time 0.5643 (0.5840) loss 7.4763 (7.9183) grad_norm 2.6412 (2.1882) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:29:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][290/625] eta 0:03:15 lr 0.000958 wd 0.0500 time 0.5635 (0.5846) data time 0.0006 (0.0023) model time 0.5629 (0.5837) loss 6.2065 (7.9098) grad_norm 2.6563 (2.1854) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:29:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][300/625] eta 0:03:09 lr 0.000958 wd 0.0500 time 0.5737 (0.5844) data time 0.0008 (0.0022) model time 0.5729 (0.5835) loss 7.0481 (7.9179) grad_norm 1.6185 (2.1730) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:29:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][310/625] eta 0:03:04 lr 0.000958 wd 0.0500 time 0.7320 (0.5861) data time 0.0008 (0.0022) model time 0.7312 (0.5855) loss 8.0578 (7.9159) grad_norm 1.5844 (2.1626) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:29:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][320/625] eta 0:02:59 lr 0.000958 wd 0.0500 time 0.7287 (0.5875) data time 0.0008 (0.0021) model time 0.7278 (0.5871) loss 9.6889 (7.9251) grad_norm 2.4281 (2.1585) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:29:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][330/625] eta 0:02:53 lr 0.000958 wd 0.0500 time 0.5640 (0.5876) data time 0.0007 (0.0021) model time 0.5633 (0.5873) loss 9.0004 (7.9214) grad_norm 1.7940 (2.1523) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:29:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][340/625] eta 0:02:47 lr 0.000958 wd 0.0500 time 0.5620 (0.5873) data time 0.0008 (0.0020) model time 0.5612 (0.5868) loss 8.9174 (7.9229) grad_norm 1.5765 (2.1493) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:29:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][350/625] eta 0:02:41 lr 0.000958 wd 0.0500 time 0.5624 (0.5873) data time 0.0008 (0.0020) model time 0.5616 (0.5869) loss 8.3353 (7.9213) grad_norm 2.0690 (2.1456) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:29:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][360/625] eta 0:02:35 lr 0.000957 wd 0.0500 time 0.5658 (0.5871) data time 0.0008 (0.0020) model time 0.5649 (0.5866) loss 6.4765 (7.9006) grad_norm 1.5700 (2.1376) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:29:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][370/625] eta 0:02:29 lr 0.000957 wd 0.0500 time 0.5666 (0.5869) data time 0.0008 (0.0020) model time 0.5659 (0.5863) loss 8.0265 (7.9057) grad_norm 1.8397 (2.1326) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:29:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][380/625] eta 0:02:23 lr 0.000957 wd 0.0500 time 0.5678 (0.5867) data time 0.0007 (0.0019) model time 0.5672 (0.5861) loss 8.4426 (7.9084) grad_norm 1.8368 (2.1336) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:30:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][390/625] eta 0:02:17 lr 0.000957 wd 0.0500 time 0.5672 (0.5864) data time 0.0006 (0.0019) model time 0.5666 (0.5858) loss 7.7699 (7.9029) grad_norm 2.3990 (2.1349) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:30:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][400/625] eta 0:02:11 lr 0.000957 wd 0.0500 time 0.5698 (0.5865) data time 0.0007 (0.0019) model time 0.5690 (0.5859) loss 8.3805 (7.9028) grad_norm 1.9895 (2.1327) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:30:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][410/625] eta 0:02:06 lr 0.000957 wd 0.0500 time 0.5647 (0.5863) data time 0.0008 (0.0018) model time 0.5639 (0.5856) loss 8.8514 (7.9048) grad_norm 1.6575 (2.1247) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:30:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][420/625] eta 0:02:00 lr 0.000957 wd 0.0500 time 0.5690 (0.5861) data time 0.0006 (0.0018) model time 0.5684 (0.5854) loss 8.5537 (7.9144) grad_norm 1.8850 (2.1189) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:30:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][430/625] eta 0:01:54 lr 0.000957 wd 0.0500 time 0.5674 (0.5860) data time 0.0006 (0.0018) model time 0.5668 (0.5852) loss 8.6664 (7.9186) grad_norm 2.1028 (2.1218) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:30:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][440/625] eta 0:01:48 lr 0.000957 wd 0.0500 time 0.5733 (0.5858) data time 0.0008 (0.0018) model time 0.5725 (0.5851) loss 8.4000 (7.9209) grad_norm 2.0257 (2.1200) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:30:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][450/625] eta 0:01:42 lr 0.000957 wd 0.0500 time 0.5654 (0.5856) data time 0.0006 (0.0018) model time 0.5649 (0.5849) loss 6.6063 (7.9220) grad_norm 3.0657 (2.1379) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:30:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][460/625] eta 0:01:36 lr 0.000957 wd 0.0500 time 0.5669 (0.5855) data time 0.0008 (0.0017) model time 0.5661 (0.5847) loss 6.6763 (7.9179) grad_norm 1.8121 (2.1439) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:30:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][470/625] eta 0:01:30 lr 0.000956 wd 0.0500 time 0.5676 (0.5853) data time 0.0008 (0.0017) model time 0.5668 (0.5845) loss 8.9780 (7.9240) grad_norm 1.7804 (2.1417) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:30:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][480/625] eta 0:01:24 lr 0.000956 wd 0.0500 time 0.5684 (0.5851) data time 0.0006 (0.0017) model time 0.5678 (0.5843) loss 7.4682 (7.9237) grad_norm 1.5917 (2.1364) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:31:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][490/625] eta 0:01:18 lr 0.000956 wd 0.0500 time 0.5635 (0.5850) data time 0.0009 (0.0017) model time 0.5627 (0.5841) loss 7.9032 (7.9259) grad_norm 1.4318 (2.1329) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:31:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][500/625] eta 0:01:13 lr 0.000956 wd 0.0500 time 0.5648 (0.5848) data time 0.0008 (0.0017) model time 0.5640 (0.5839) loss 9.0152 (7.9261) grad_norm 1.5464 (2.1270) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:31:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][510/625] eta 0:01:07 lr 0.000956 wd 0.0500 time 0.5650 (0.5847) data time 0.0006 (0.0017) model time 0.5644 (0.5838) loss 7.3849 (7.9232) grad_norm 2.0442 (2.1219) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:31:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][520/625] eta 0:01:01 lr 0.000956 wd 0.0500 time 0.5725 (0.5845) data time 0.0006 (0.0016) model time 0.5719 (0.5836) loss 8.2898 (7.9371) grad_norm 1.8800 (2.1213) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:31:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][530/625] eta 0:00:55 lr 0.000956 wd 0.0500 time 0.5644 (0.5854) data time 0.0009 (0.0016) model time 0.5635 (0.5846) loss 6.7898 (7.9296) grad_norm 1.6087 (2.1176) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:31:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][540/625] eta 0:00:49 lr 0.000956 wd 0.0500 time 0.7342 (0.5867) data time 0.0006 (0.0016) model time 0.7336 (0.5860) loss 9.7020 (7.9270) grad_norm 1.7681 (2.1105) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:31:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][550/625] eta 0:00:44 lr 0.000956 wd 0.0500 time 0.5684 (0.5868) data time 0.0008 (0.0016) model time 0.5676 (0.5862) loss 9.0331 (7.9307) grad_norm 1.7677 (2.1088) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:31:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][560/625] eta 0:00:38 lr 0.000956 wd 0.0500 time 0.5655 (0.5867) data time 0.0006 (0.0016) model time 0.5649 (0.5860) loss 8.2702 (7.9266) grad_norm 1.7409 (2.1135) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:31:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][570/625] eta 0:00:32 lr 0.000956 wd 0.0500 time 0.5629 (0.5865) data time 0.0008 (0.0016) model time 0.5621 (0.5858) loss 8.8645 (7.9229) grad_norm 4.4778 (2.1239) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:31:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][580/625] eta 0:00:26 lr 0.000956 wd 0.0500 time 0.5638 (0.5863) data time 0.0008 (0.0015) model time 0.5630 (0.5856) loss 7.1991 (7.9304) grad_norm 1.7933 (2.1266) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:31:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][590/625] eta 0:00:20 lr 0.000955 wd 0.0500 time 0.5674 (0.5862) data time 0.0007 (0.0015) model time 0.5667 (0.5854) loss 8.0494 (7.9229) grad_norm 3.2887 (2.1274) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:32:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][600/625] eta 0:00:14 lr 0.000955 wd 0.0500 time 0.5646 (0.5860) data time 0.0009 (0.0015) model time 0.5637 (0.5852) loss 8.6719 (7.9227) grad_norm 1.8086 (2.1323) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:32:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][610/625] eta 0:00:08 lr 0.000955 wd 0.0500 time 0.5673 (0.5859) data time 0.0004 (0.0015) model time 0.5669 (0.5851) loss 8.2945 (7.9218) grad_norm 1.4754 (2.1263) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:32:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [103/300][620/625] eta 0:00:02 lr 0.000955 wd 0.0500 time 0.5657 (0.5859) data time 0.0006 (0.0015) model time 0.5652 (0.5852) loss 7.0752 (7.9274) grad_norm 1.6199 (2.1319) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:32:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 103 training takes 0:06:06 +[2024-07-25 00:32:19 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 00:32:20 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 00:32:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.483 (0.483) Loss 0.5386 (0.5386) Acc@1 89.209 (89.209) Acc@5 98.584 (98.584) Mem 22339MB +[2024-07-25 00:32:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.159) Loss 0.8892 (0.6719) Acc@1 78.320 (85.067) Acc@5 95.752 (97.528) Mem 22339MB +[2024-07-25 00:32:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 1.0020 (0.7988) Acc@1 75.488 (81.801) Acc@5 94.141 (96.089) Mem 22339MB +[2024-07-25 00:32:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.450 Acc@5 96.041 +[2024-07-25 00:32:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.4% +[2024-07-25 00:32:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 81.45% +[2024-07-25 00:32:24 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 00:32:25 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 00:32:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.483 (0.483) Loss 0.5205 (0.5205) Acc@1 89.404 (89.404) Acc@5 98.486 (98.486) Mem 22339MB +[2024-07-25 00:32:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.8374 (0.6543) Acc@1 80.273 (85.884) Acc@5 96.143 (97.643) Mem 22339MB +[2024-07-25 00:32:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9453 (0.7673) Acc@1 76.221 (82.592) Acc@5 95.068 (96.431) Mem 22339MB +[2024-07-25 00:32:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.284 Acc@5 96.443 +[2024-07-25 00:32:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.3% +[2024-07-25 00:32:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][0/625] eta 0:16:04 lr 0.000955 wd 0.0500 time 1.5432 (1.5432) data time 0.5378 (0.5378) model time 0.0000 (0.0000) loss 8.2470 (8.2470) grad_norm 1.5169 (1.5169) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:32:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][10/625] eta 0:06:49 lr 0.000955 wd 0.0500 time 0.5779 (0.6654) data time 0.0008 (0.0496) model time 0.0000 (0.0000) loss 7.6160 (8.0503) grad_norm 1.5344 (1.6770) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:32:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][20/625] eta 0:06:16 lr 0.000955 wd 0.0500 time 0.5829 (0.6229) data time 0.0006 (0.0264) model time 0.0000 (0.0000) loss 7.8557 (7.9658) grad_norm 3.0114 (2.0422) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:32:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][30/625] eta 0:06:01 lr 0.000955 wd 0.0500 time 0.5826 (0.6081) data time 0.0008 (0.0181) model time 0.0000 (0.0000) loss 8.0689 (8.0377) grad_norm 1.9960 (2.1405) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:32:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][40/625] eta 0:05:51 lr 0.000955 wd 0.0500 time 0.5769 (0.6006) data time 0.0008 (0.0139) model time 0.0000 (0.0000) loss 8.6425 (8.0795) grad_norm 1.7617 (2.0789) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:32:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][50/625] eta 0:05:42 lr 0.000955 wd 0.0500 time 0.5836 (0.5961) data time 0.0006 (0.0113) model time 0.0000 (0.0000) loss 7.9613 (8.0451) grad_norm 2.5750 (2.1213) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:33:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][60/625] eta 0:05:35 lr 0.000955 wd 0.0500 time 0.5823 (0.5933) data time 0.0007 (0.0096) model time 0.5815 (0.5780) loss 8.1754 (8.0033) grad_norm 1.6338 (2.0881) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:33:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][70/625] eta 0:05:28 lr 0.000955 wd 0.0500 time 0.5806 (0.5912) data time 0.0008 (0.0084) model time 0.5798 (0.5777) loss 6.9674 (7.9815) grad_norm 1.9225 (2.0690) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:33:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][80/625] eta 0:05:21 lr 0.000954 wd 0.0500 time 0.5771 (0.5893) data time 0.0006 (0.0075) model time 0.5765 (0.5768) loss 7.3734 (7.9667) grad_norm 1.9926 (2.1069) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:33:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][90/625] eta 0:05:14 lr 0.000954 wd 0.0500 time 0.5832 (0.5883) data time 0.0008 (0.0067) model time 0.5824 (0.5776) loss 9.0493 (7.9633) grad_norm 2.1766 (2.0662) loss_scale 4096.0000 (2205.5385) mem 22339MB +[2024-07-25 00:33:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][100/625] eta 0:05:08 lr 0.000954 wd 0.0500 time 0.5797 (0.5872) data time 0.0008 (0.0061) model time 0.5789 (0.5773) loss 8.1281 (7.9304) grad_norm 2.0585 (2.0331) loss_scale 4096.0000 (2392.7129) mem 22339MB +[2024-07-25 00:33:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][110/625] eta 0:05:01 lr 0.000954 wd 0.0500 time 0.5849 (0.5864) data time 0.0009 (0.0057) model time 0.5841 (0.5772) loss 8.4235 (7.9570) grad_norm 2.0997 (2.0405) loss_scale 4096.0000 (2546.1622) mem 22339MB +[2024-07-25 00:33:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][120/625] eta 0:04:57 lr 0.000954 wd 0.0500 time 0.7523 (0.5885) data time 0.0009 (0.0053) model time 0.7514 (0.5821) loss 8.2234 (7.9458) grad_norm 2.6974 (2.0586) loss_scale 4096.0000 (2674.2479) mem 22339MB +[2024-07-25 00:33:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][130/625] eta 0:04:54 lr 0.000954 wd 0.0500 time 0.7476 (0.5947) data time 0.0006 (0.0049) model time 0.7469 (0.5930) loss 6.3074 (7.9297) grad_norm 1.7561 (2.0567) loss_scale 4096.0000 (2782.7786) mem 22339MB +[2024-07-25 00:33:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][140/625] eta 0:04:49 lr 0.000954 wd 0.0500 time 0.5816 (0.5976) data time 0.0007 (0.0046) model time 0.5809 (0.5976) loss 6.4377 (7.9123) grad_norm 1.8428 (2.0731) loss_scale 4096.0000 (2875.9149) mem 22339MB +[2024-07-25 00:33:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][150/625] eta 0:04:43 lr 0.000954 wd 0.0500 time 0.5883 (0.5968) data time 0.0008 (0.0044) model time 0.5875 (0.5963) loss 7.4049 (7.8882) grad_norm 1.5560 (2.0799) loss_scale 4096.0000 (2956.7152) mem 22339MB +[2024-07-25 00:34:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][160/625] eta 0:04:36 lr 0.000954 wd 0.0500 time 0.5878 (0.5956) data time 0.0008 (0.0042) model time 0.5870 (0.5946) loss 7.9154 (7.8991) grad_norm 2.4184 (2.0626) loss_scale 4096.0000 (3027.4783) mem 22339MB +[2024-07-25 00:34:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][170/625] eta 0:04:30 lr 0.000954 wd 0.0500 time 0.5820 (0.5946) data time 0.0006 (0.0040) model time 0.5814 (0.5931) loss 7.0109 (7.8812) grad_norm 1.6514 (2.0507) loss_scale 4096.0000 (3089.9649) mem 22339MB +[2024-07-25 00:34:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][180/625] eta 0:04:24 lr 0.000954 wd 0.0500 time 0.5870 (0.5937) data time 0.0007 (0.0038) model time 0.5863 (0.5919) loss 8.8235 (7.9051) grad_norm 2.5967 (2.0686) loss_scale 4096.0000 (3145.5470) mem 22339MB +[2024-07-25 00:34:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][190/625] eta 0:04:17 lr 0.000954 wd 0.0500 time 0.5849 (0.5929) data time 0.0008 (0.0036) model time 0.5841 (0.5908) loss 7.1216 (7.9087) grad_norm 1.8217 (inf) loss_scale 2048.0000 (3152.4188) mem 22339MB +[2024-07-25 00:34:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][200/625] eta 0:04:11 lr 0.000953 wd 0.0500 time 0.5792 (0.5921) data time 0.0006 (0.0035) model time 0.5786 (0.5900) loss 8.7911 (7.8909) grad_norm 1.6352 (inf) loss_scale 2048.0000 (3097.4726) mem 22339MB +[2024-07-25 00:34:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][210/625] eta 0:04:05 lr 0.000953 wd 0.0500 time 0.5789 (0.5914) data time 0.0006 (0.0034) model time 0.5783 (0.5891) loss 8.4091 (7.8877) grad_norm 2.3096 (inf) loss_scale 2048.0000 (3047.7346) mem 22339MB +[2024-07-25 00:34:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][220/625] eta 0:03:59 lr 0.000953 wd 0.0500 time 0.5870 (0.5908) data time 0.0008 (0.0033) model time 0.5862 (0.5884) loss 8.4375 (7.9061) grad_norm 1.9058 (inf) loss_scale 2048.0000 (3002.4977) mem 22339MB +[2024-07-25 00:34:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][230/625] eta 0:03:53 lr 0.000953 wd 0.0500 time 0.5769 (0.5901) data time 0.0006 (0.0032) model time 0.5763 (0.5876) loss 9.0304 (7.8870) grad_norm 1.9673 (inf) loss_scale 2048.0000 (2961.1775) mem 22339MB +[2024-07-25 00:34:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][240/625] eta 0:03:46 lr 0.000953 wd 0.0500 time 0.5853 (0.5896) data time 0.0008 (0.0031) model time 0.5845 (0.5870) loss 9.4016 (7.9075) grad_norm 1.9865 (inf) loss_scale 2048.0000 (2923.2863) mem 22339MB +[2024-07-25 00:34:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][250/625] eta 0:03:40 lr 0.000953 wd 0.0500 time 0.5836 (0.5892) data time 0.0006 (0.0030) model time 0.5830 (0.5866) loss 5.9556 (7.8609) grad_norm 2.0047 (inf) loss_scale 2048.0000 (2888.4143) mem 22339MB +[2024-07-25 00:35:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][260/625] eta 0:03:34 lr 0.000953 wd 0.0500 time 0.5865 (0.5888) data time 0.0006 (0.0029) model time 0.5859 (0.5862) loss 6.5282 (7.8324) grad_norm 2.1949 (inf) loss_scale 2048.0000 (2856.2146) mem 22339MB +[2024-07-25 00:35:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][270/625] eta 0:03:28 lr 0.000953 wd 0.0500 time 0.5855 (0.5884) data time 0.0006 (0.0028) model time 0.5848 (0.5858) loss 7.7732 (7.8452) grad_norm 2.1456 (inf) loss_scale 2048.0000 (2826.3911) mem 22339MB +[2024-07-25 00:35:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][280/625] eta 0:03:22 lr 0.000953 wd 0.0500 time 0.5871 (0.5881) data time 0.0008 (0.0027) model time 0.5864 (0.5854) loss 8.5847 (7.8358) grad_norm 2.3780 (inf) loss_scale 2048.0000 (2798.6904) mem 22339MB +[2024-07-25 00:35:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][290/625] eta 0:03:16 lr 0.000953 wd 0.0500 time 0.6232 (0.5879) data time 0.0006 (0.0027) model time 0.6226 (0.5853) loss 8.5030 (7.8385) grad_norm 1.8516 (inf) loss_scale 2048.0000 (2772.8935) mem 22339MB +[2024-07-25 00:35:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][300/625] eta 0:03:10 lr 0.000953 wd 0.0500 time 0.5896 (0.5875) data time 0.0008 (0.0027) model time 0.5888 (0.5849) loss 6.5044 (7.8197) grad_norm 1.8898 (inf) loss_scale 2048.0000 (2748.8106) mem 22339MB +[2024-07-25 00:35:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][310/625] eta 0:03:04 lr 0.000952 wd 0.0500 time 0.5865 (0.5872) data time 0.0007 (0.0026) model time 0.5858 (0.5846) loss 8.5273 (7.8210) grad_norm 3.2195 (inf) loss_scale 2048.0000 (2726.2765) mem 22339MB +[2024-07-25 00:35:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][320/625] eta 0:02:59 lr 0.000952 wd 0.0500 time 0.5971 (0.5870) data time 0.0006 (0.0026) model time 0.5965 (0.5843) loss 6.0126 (7.8179) grad_norm 2.6604 (inf) loss_scale 2048.0000 (2705.1464) mem 22339MB +[2024-07-25 00:35:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][330/625] eta 0:02:53 lr 0.000952 wd 0.0500 time 0.6051 (0.5868) data time 0.0007 (0.0025) model time 0.6044 (0.5842) loss 8.0788 (7.8258) grad_norm 2.6304 (inf) loss_scale 2048.0000 (2685.2931) mem 22339MB +[2024-07-25 00:35:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][340/625] eta 0:02:47 lr 0.000952 wd 0.0500 time 0.7112 (0.5873) data time 0.0006 (0.0025) model time 0.7106 (0.5848) loss 8.4313 (7.8171) grad_norm 2.2832 (inf) loss_scale 2048.0000 (2666.6041) mem 22339MB +[2024-07-25 00:35:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][350/625] eta 0:02:41 lr 0.000952 wd 0.0500 time 0.7040 (0.5886) data time 0.0006 (0.0025) model time 0.7033 (0.5863) loss 7.4375 (7.8125) grad_norm 1.8403 (inf) loss_scale 2048.0000 (2648.9801) mem 22339MB +[2024-07-25 00:36:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][360/625] eta 0:02:36 lr 0.000952 wd 0.0500 time 0.5856 (0.5894) data time 0.0008 (0.0024) model time 0.5849 (0.5874) loss 9.2439 (7.8299) grad_norm 1.4893 (inf) loss_scale 2048.0000 (2632.3324) mem 22339MB +[2024-07-25 00:36:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][370/625] eta 0:02:30 lr 0.000952 wd 0.0500 time 0.5814 (0.5892) data time 0.0007 (0.0024) model time 0.5807 (0.5872) loss 6.3610 (7.8158) grad_norm 1.9815 (inf) loss_scale 2048.0000 (2616.5822) mem 22339MB +[2024-07-25 00:36:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][380/625] eta 0:02:24 lr 0.000952 wd 0.0500 time 0.5793 (0.5889) data time 0.0006 (0.0023) model time 0.5787 (0.5869) loss 7.9654 (7.8174) grad_norm 1.5300 (inf) loss_scale 2048.0000 (2601.6588) mem 22339MB +[2024-07-25 00:36:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][390/625] eta 0:02:18 lr 0.000952 wd 0.0500 time 0.5831 (0.5887) data time 0.0006 (0.0023) model time 0.5825 (0.5866) loss 6.9400 (7.8170) grad_norm 1.8717 (inf) loss_scale 2048.0000 (2587.4987) mem 22339MB +[2024-07-25 00:36:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][400/625] eta 0:02:12 lr 0.000952 wd 0.0500 time 0.5835 (0.5884) data time 0.0006 (0.0023) model time 0.5829 (0.5863) loss 6.5155 (7.8115) grad_norm 2.0566 (inf) loss_scale 2048.0000 (2574.0449) mem 22339MB +[2024-07-25 00:36:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][410/625] eta 0:02:06 lr 0.000952 wd 0.0500 time 0.5796 (0.5881) data time 0.0006 (0.0022) model time 0.5790 (0.5860) loss 7.3841 (7.8026) grad_norm 1.4684 (inf) loss_scale 2048.0000 (2561.2457) mem 22339MB +[2024-07-25 00:36:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][420/625] eta 0:02:00 lr 0.000952 wd 0.0500 time 0.5836 (0.5879) data time 0.0008 (0.0022) model time 0.5828 (0.5858) loss 8.8639 (7.8144) grad_norm 2.2064 (inf) loss_scale 2048.0000 (2549.0546) mem 22339MB +[2024-07-25 00:36:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][430/625] eta 0:01:54 lr 0.000951 wd 0.0500 time 0.5741 (0.5876) data time 0.0008 (0.0022) model time 0.5733 (0.5856) loss 6.4805 (7.8050) grad_norm 1.6012 (inf) loss_scale 2048.0000 (2537.4292) mem 22339MB +[2024-07-25 00:36:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][440/625] eta 0:01:48 lr 0.000951 wd 0.0500 time 0.5937 (0.5874) data time 0.0006 (0.0021) model time 0.5931 (0.5854) loss 7.3753 (7.8117) grad_norm 1.5821 (inf) loss_scale 2048.0000 (2526.3311) mem 22339MB +[2024-07-25 00:36:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][450/625] eta 0:01:42 lr 0.000951 wd 0.0500 time 0.5789 (0.5872) data time 0.0006 (0.0021) model time 0.5783 (0.5851) loss 6.8086 (7.8178) grad_norm 1.5748 (inf) loss_scale 2048.0000 (2515.7251) mem 22339MB +[2024-07-25 00:36:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][460/625] eta 0:01:36 lr 0.000951 wd 0.0500 time 0.5895 (0.5870) data time 0.0008 (0.0021) model time 0.5887 (0.5850) loss 9.4141 (7.8275) grad_norm 2.3347 (inf) loss_scale 2048.0000 (2505.5792) mem 22339MB +[2024-07-25 00:37:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][470/625] eta 0:01:30 lr 0.000951 wd 0.0500 time 0.5811 (0.5868) data time 0.0008 (0.0021) model time 0.5803 (0.5848) loss 7.0586 (7.8404) grad_norm 1.6381 (inf) loss_scale 2048.0000 (2495.8641) mem 22339MB +[2024-07-25 00:37:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][480/625] eta 0:01:25 lr 0.000951 wd 0.0500 time 0.5852 (0.5866) data time 0.0006 (0.0020) model time 0.5846 (0.5846) loss 8.4919 (7.8376) grad_norm 2.1902 (inf) loss_scale 2048.0000 (2486.5530) mem 22339MB +[2024-07-25 00:37:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][490/625] eta 0:01:19 lr 0.000951 wd 0.0500 time 0.5791 (0.5864) data time 0.0007 (0.0020) model time 0.5784 (0.5844) loss 7.7740 (7.8358) grad_norm 1.7968 (inf) loss_scale 2048.0000 (2477.6212) mem 22339MB +[2024-07-25 00:37:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][500/625] eta 0:01:13 lr 0.000951 wd 0.0500 time 0.5783 (0.5862) data time 0.0008 (0.0020) model time 0.5775 (0.5842) loss 8.4435 (7.8373) grad_norm 1.5219 (inf) loss_scale 2048.0000 (2469.0459) mem 22339MB +[2024-07-25 00:37:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][510/625] eta 0:01:07 lr 0.000951 wd 0.0500 time 0.5828 (0.5861) data time 0.0008 (0.0020) model time 0.5819 (0.5840) loss 8.7269 (7.8433) grad_norm 1.7124 (inf) loss_scale 2048.0000 (2460.8063) mem 22339MB +[2024-07-25 00:37:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][520/625] eta 0:01:01 lr 0.000951 wd 0.0500 time 0.5784 (0.5859) data time 0.0006 (0.0019) model time 0.5778 (0.5839) loss 6.7999 (7.8473) grad_norm 2.2207 (inf) loss_scale 2048.0000 (2452.8829) mem 22339MB +[2024-07-25 00:37:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][530/625] eta 0:00:55 lr 0.000951 wd 0.0500 time 0.5772 (0.5857) data time 0.0008 (0.0019) model time 0.5764 (0.5837) loss 8.4531 (7.8500) grad_norm 1.7250 (inf) loss_scale 2048.0000 (2445.2580) mem 22339MB +[2024-07-25 00:37:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][540/625] eta 0:00:49 lr 0.000950 wd 0.0500 time 0.5886 (0.5856) data time 0.0006 (0.0019) model time 0.5880 (0.5836) loss 7.2777 (7.8443) grad_norm 2.0890 (inf) loss_scale 2048.0000 (2437.9150) mem 22339MB +[2024-07-25 00:37:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][550/625] eta 0:00:43 lr 0.000950 wd 0.0500 time 0.5810 (0.5855) data time 0.0008 (0.0019) model time 0.5802 (0.5835) loss 8.0178 (7.8460) grad_norm 2.2136 (inf) loss_scale 2048.0000 (2430.8385) mem 22339MB +[2024-07-25 00:37:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][560/625] eta 0:00:38 lr 0.000950 wd 0.0500 time 0.5805 (0.5858) data time 0.0008 (0.0019) model time 0.5797 (0.5839) loss 8.4373 (7.8474) grad_norm 1.5930 (inf) loss_scale 2048.0000 (2424.0143) mem 22339MB +[2024-07-25 00:38:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][570/625] eta 0:00:32 lr 0.000950 wd 0.0500 time 0.5786 (0.5869) data time 0.0008 (0.0018) model time 0.5778 (0.5851) loss 8.1105 (7.8476) grad_norm 1.7215 (inf) loss_scale 2048.0000 (2417.4291) mem 22339MB +[2024-07-25 00:38:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][580/625] eta 0:00:26 lr 0.000950 wd 0.0500 time 0.7284 (0.5881) data time 0.0008 (0.0018) model time 0.7276 (0.5865) loss 6.5641 (7.8413) grad_norm 1.8196 (inf) loss_scale 2048.0000 (2411.0706) mem 22339MB +[2024-07-25 00:38:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][590/625] eta 0:00:20 lr 0.000950 wd 0.0500 time 0.5776 (0.5882) data time 0.0007 (0.0018) model time 0.5769 (0.5865) loss 7.9343 (7.8408) grad_norm 1.4911 (inf) loss_scale 2048.0000 (2404.9272) mem 22339MB +[2024-07-25 00:38:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][600/625] eta 0:00:14 lr 0.000950 wd 0.0500 time 0.5833 (0.5880) data time 0.0008 (0.0018) model time 0.5826 (0.5864) loss 9.0569 (7.8370) grad_norm 1.5897 (inf) loss_scale 2048.0000 (2398.9884) mem 22339MB +[2024-07-25 00:38:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][610/625] eta 0:00:08 lr 0.000950 wd 0.0500 time 0.5797 (0.5879) data time 0.0004 (0.0018) model time 0.5792 (0.5862) loss 9.2384 (7.8405) grad_norm 2.1319 (inf) loss_scale 2048.0000 (2393.2439) mem 22339MB +[2024-07-25 00:38:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [104/300][620/625] eta 0:00:02 lr 0.000950 wd 0.0500 time 0.5828 (0.5877) data time 0.0006 (0.0018) model time 0.5822 (0.5861) loss 9.1054 (7.8428) grad_norm 2.1309 (inf) loss_scale 2048.0000 (2387.6844) mem 22339MB +[2024-07-25 00:38:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 104 training takes 0:06:07 +[2024-07-25 00:38:36 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 00:38:37 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 00:38:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.476 (0.476) Loss 0.5303 (0.5303) Acc@1 88.135 (88.135) Acc@5 98.682 (98.682) Mem 22339MB +[2024-07-25 00:38:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.9077 (0.6820) Acc@1 78.516 (85.138) Acc@5 95.068 (97.430) Mem 22339MB +[2024-07-25 00:38:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 1.0225 (0.8064) Acc@1 75.049 (81.806) Acc@5 94.043 (96.045) Mem 22339MB +[2024-07-25 00:38:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.448 Acc@5 96.013 +[2024-07-25 00:38:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.4% +[2024-07-25 00:38:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.840 (0.840) Loss 0.5176 (0.5176) Acc@1 89.404 (89.404) Acc@5 98.486 (98.486) Mem 22339MB +[2024-07-25 00:38:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.191) Loss 0.8340 (0.6523) Acc@1 80.225 (85.858) Acc@5 96.094 (97.652) Mem 22339MB +[2024-07-25 00:38:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.160) Loss 0.9438 (0.7654) Acc@1 76.025 (82.575) Acc@5 94.922 (96.438) Mem 22339MB +[2024-07-25 00:38:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.276 Acc@5 96.447 +[2024-07-25 00:38:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.3% +[2024-07-25 00:38:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][0/625] eta 0:14:09 lr 0.000950 wd 0.0500 time 1.3594 (1.3594) data time 0.7225 (0.7225) model time 0.0000 (0.0000) loss 6.7350 (6.7350) grad_norm 1.9957 (1.9957) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:38:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][10/625] eta 0:06:38 lr 0.000950 wd 0.0500 time 0.5786 (0.6482) data time 0.0006 (0.0665) model time 0.0000 (0.0000) loss 8.1706 (7.7896) grad_norm 1.4347 (1.8759) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:38:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][20/625] eta 0:06:11 lr 0.000950 wd 0.0500 time 0.5821 (0.6138) data time 0.0008 (0.0352) model time 0.0000 (0.0000) loss 8.5938 (7.9620) grad_norm 1.5881 (1.9806) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:39:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][30/625] eta 0:05:57 lr 0.000949 wd 0.0500 time 0.5785 (0.6017) data time 0.0006 (0.0242) model time 0.0000 (0.0000) loss 8.8124 (7.8563) grad_norm 2.0354 (2.0496) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:39:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][40/625] eta 0:05:49 lr 0.000949 wd 0.0500 time 0.5798 (0.5971) data time 0.0006 (0.0200) model time 0.0000 (0.0000) loss 8.8663 (7.9614) grad_norm 2.1770 (2.0954) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:39:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][50/625] eta 0:05:41 lr 0.000949 wd 0.0500 time 0.5813 (0.5931) data time 0.0006 (0.0162) model time 0.0000 (0.0000) loss 6.7399 (7.9651) grad_norm 2.1762 (2.1490) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:39:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][60/625] eta 0:05:33 lr 0.000949 wd 0.0500 time 0.5805 (0.5904) data time 0.0008 (0.0137) model time 0.5797 (0.5759) loss 7.5431 (7.9293) grad_norm 2.3144 (2.1361) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:39:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][70/625] eta 0:05:26 lr 0.000949 wd 0.0500 time 0.5778 (0.5884) data time 0.0006 (0.0119) model time 0.5772 (0.5755) loss 8.4974 (7.9161) grad_norm 2.3146 (2.1325) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:39:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][80/625] eta 0:05:19 lr 0.000949 wd 0.0500 time 0.5854 (0.5871) data time 0.0008 (0.0106) model time 0.5846 (0.5760) loss 7.2702 (7.8952) grad_norm 1.8005 (2.1615) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:39:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][90/625] eta 0:05:13 lr 0.000949 wd 0.0500 time 0.5828 (0.5859) data time 0.0006 (0.0095) model time 0.5822 (0.5758) loss 8.7365 (7.8763) grad_norm 1.6053 (2.1447) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:39:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][100/625] eta 0:05:07 lr 0.000949 wd 0.0500 time 0.5788 (0.5849) data time 0.0009 (0.0086) model time 0.5779 (0.5757) loss 8.5926 (7.8784) grad_norm 1.9418 (2.1301) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:39:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][110/625] eta 0:05:01 lr 0.000949 wd 0.0500 time 0.5787 (0.5846) data time 0.0008 (0.0079) model time 0.5780 (0.5765) loss 8.1307 (7.9242) grad_norm 3.0243 (2.1591) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:39:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][120/625] eta 0:04:54 lr 0.000949 wd 0.0500 time 0.5796 (0.5840) data time 0.0007 (0.0073) model time 0.5788 (0.5765) loss 8.0695 (7.9233) grad_norm 1.6526 (2.1732) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:40:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][130/625] eta 0:04:48 lr 0.000949 wd 0.0500 time 0.5863 (0.5836) data time 0.0006 (0.0068) model time 0.5856 (0.5767) loss 7.4042 (7.8651) grad_norm 2.9976 (2.1773) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:40:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][140/625] eta 0:04:42 lr 0.000949 wd 0.0500 time 0.5935 (0.5833) data time 0.0007 (0.0064) model time 0.5929 (0.5769) loss 6.0122 (7.8103) grad_norm 1.9200 (2.1672) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:40:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][150/625] eta 0:04:36 lr 0.000948 wd 0.0500 time 0.5791 (0.5829) data time 0.0009 (0.0060) model time 0.5781 (0.5769) loss 8.9646 (7.8254) grad_norm 2.2464 (2.1635) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:40:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][160/625] eta 0:04:32 lr 0.000948 wd 0.0500 time 0.7553 (0.5861) data time 0.0006 (0.0057) model time 0.7547 (0.5819) loss 8.2424 (7.8640) grad_norm 2.4177 (2.1532) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:40:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][170/625] eta 0:04:28 lr 0.000948 wd 0.0500 time 0.7153 (0.5910) data time 0.0007 (0.0054) model time 0.7146 (0.5892) loss 6.7994 (7.8482) grad_norm 1.9983 (2.1367) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:40:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][180/625] eta 0:04:23 lr 0.000948 wd 0.0500 time 0.5846 (0.5916) data time 0.0007 (0.0052) model time 0.5838 (0.5902) loss 9.3123 (7.8468) grad_norm 3.4223 (2.1591) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:40:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][190/625] eta 0:04:17 lr 0.000948 wd 0.0500 time 0.5863 (0.5909) data time 0.0006 (0.0050) model time 0.5857 (0.5893) loss 9.0329 (7.8582) grad_norm 1.6297 (2.1572) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:40:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][200/625] eta 0:04:10 lr 0.000948 wd 0.0500 time 0.5814 (0.5903) data time 0.0006 (0.0048) model time 0.5807 (0.5885) loss 7.8121 (7.8507) grad_norm 2.5539 (2.1681) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:40:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][210/625] eta 0:04:04 lr 0.000948 wd 0.0500 time 0.5830 (0.5898) data time 0.0006 (0.0046) model time 0.5824 (0.5879) loss 9.1210 (7.8466) grad_norm 1.9886 (2.1716) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:40:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][220/625] eta 0:03:58 lr 0.000948 wd 0.0500 time 0.5779 (0.5892) data time 0.0006 (0.0044) model time 0.5773 (0.5872) loss 8.7018 (7.8435) grad_norm 1.5222 (2.1596) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:41:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][230/625] eta 0:03:52 lr 0.000948 wd 0.0500 time 0.5760 (0.5887) data time 0.0008 (0.0042) model time 0.5752 (0.5867) loss 6.9442 (7.8394) grad_norm 2.6788 (2.1614) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:41:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][240/625] eta 0:03:46 lr 0.000948 wd 0.0500 time 0.5925 (0.5883) data time 0.0006 (0.0041) model time 0.5919 (0.5862) loss 6.5027 (7.8248) grad_norm 2.1031 (2.1576) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:41:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][250/625] eta 0:03:40 lr 0.000948 wd 0.0500 time 0.5813 (0.5879) data time 0.0006 (0.0040) model time 0.5807 (0.5858) loss 8.8678 (7.8418) grad_norm 2.3788 (2.1642) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:41:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][260/625] eta 0:03:34 lr 0.000947 wd 0.0500 time 0.5850 (0.5876) data time 0.0006 (0.0038) model time 0.5844 (0.5854) loss 7.5139 (7.8374) grad_norm 1.9754 (2.1670) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:41:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][270/625] eta 0:03:28 lr 0.000947 wd 0.0500 time 0.5805 (0.5872) data time 0.0008 (0.0037) model time 0.5797 (0.5851) loss 7.2679 (7.8269) grad_norm 2.8417 (2.1694) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:41:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][280/625] eta 0:03:22 lr 0.000947 wd 0.0500 time 0.5844 (0.5869) data time 0.0008 (0.0036) model time 0.5836 (0.5847) loss 8.3849 (7.8378) grad_norm 2.1839 (2.1667) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:41:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][290/625] eta 0:03:16 lr 0.000947 wd 0.0500 time 0.5787 (0.5866) data time 0.0007 (0.0035) model time 0.5780 (0.5844) loss 9.8206 (7.8459) grad_norm 1.7599 (2.1608) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:41:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][300/625] eta 0:03:10 lr 0.000947 wd 0.0500 time 0.5793 (0.5863) data time 0.0007 (0.0034) model time 0.5786 (0.5841) loss 7.9632 (7.8610) grad_norm 2.5739 (2.1705) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:41:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][310/625] eta 0:03:04 lr 0.000947 wd 0.0500 time 0.5826 (0.5860) data time 0.0006 (0.0034) model time 0.5820 (0.5838) loss 6.3818 (7.8670) grad_norm 1.9349 (2.1739) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:41:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][320/625] eta 0:02:58 lr 0.000947 wd 0.0500 time 0.5857 (0.5857) data time 0.0008 (0.0033) model time 0.5849 (0.5835) loss 7.8981 (7.8725) grad_norm 1.5848 (2.1714) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:41:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][330/625] eta 0:02:52 lr 0.000947 wd 0.0500 time 0.5799 (0.5855) data time 0.0006 (0.0032) model time 0.5794 (0.5833) loss 8.2211 (7.8895) grad_norm 2.7378 (2.1696) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:42:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][340/625] eta 0:02:46 lr 0.000947 wd 0.0500 time 0.5827 (0.5853) data time 0.0008 (0.0031) model time 0.5819 (0.5831) loss 7.9330 (7.8901) grad_norm 2.0988 (2.1771) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:42:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][350/625] eta 0:02:40 lr 0.000947 wd 0.0500 time 0.5848 (0.5851) data time 0.0006 (0.0031) model time 0.5842 (0.5829) loss 7.2467 (7.8842) grad_norm 1.5931 (2.1707) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:42:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][360/625] eta 0:02:34 lr 0.000947 wd 0.0500 time 0.5778 (0.5849) data time 0.0009 (0.0030) model time 0.5770 (0.5827) loss 9.1278 (7.8846) grad_norm 1.6124 (2.1594) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:42:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][370/625] eta 0:02:29 lr 0.000947 wd 0.0500 time 0.5871 (0.5847) data time 0.0008 (0.0029) model time 0.5863 (0.5825) loss 7.9154 (7.8858) grad_norm 2.5118 (2.1578) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:42:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][380/625] eta 0:02:23 lr 0.000946 wd 0.0500 time 0.7400 (0.5857) data time 0.0008 (0.0029) model time 0.7392 (0.5837) loss 9.1572 (7.8819) grad_norm 2.4525 (2.1517) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:42:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][390/625] eta 0:02:18 lr 0.000946 wd 0.0500 time 0.7005 (0.5879) data time 0.0007 (0.0028) model time 0.6998 (0.5863) loss 6.7225 (7.8741) grad_norm 1.4999 (2.1499) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:42:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][400/625] eta 0:02:12 lr 0.000946 wd 0.0500 time 0.5804 (0.5882) data time 0.0006 (0.0028) model time 0.5798 (0.5866) loss 8.1497 (7.8746) grad_norm 1.5757 (2.1429) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:42:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][410/625] eta 0:02:06 lr 0.000946 wd 0.0500 time 0.5764 (0.5879) data time 0.0009 (0.0027) model time 0.5755 (0.5864) loss 7.9554 (7.8720) grad_norm 4.3774 (2.1460) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:42:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][420/625] eta 0:02:00 lr 0.000946 wd 0.0500 time 0.5814 (0.5877) data time 0.0008 (0.0027) model time 0.5806 (0.5861) loss 6.3892 (7.8685) grad_norm 1.7358 (2.1424) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:42:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][430/625] eta 0:01:54 lr 0.000946 wd 0.0500 time 0.5810 (0.5874) data time 0.0008 (0.0027) model time 0.5802 (0.5858) loss 9.0992 (7.8680) grad_norm 2.7684 (2.1359) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:43:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][440/625] eta 0:01:48 lr 0.000946 wd 0.0500 time 0.5798 (0.5872) data time 0.0008 (0.0026) model time 0.5790 (0.5856) loss 8.1392 (7.8687) grad_norm 1.8606 (2.1339) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:43:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][450/625] eta 0:01:42 lr 0.000946 wd 0.0500 time 0.5817 (0.5870) data time 0.0006 (0.0026) model time 0.5811 (0.5854) loss 8.4069 (7.8676) grad_norm 2.8144 (2.1392) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:43:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][460/625] eta 0:01:36 lr 0.000946 wd 0.0500 time 0.5791 (0.5867) data time 0.0006 (0.0025) model time 0.5785 (0.5851) loss 8.1605 (7.8696) grad_norm 1.9003 (2.1465) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:43:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][470/625] eta 0:01:30 lr 0.000946 wd 0.0500 time 0.5838 (0.5865) data time 0.0008 (0.0025) model time 0.5830 (0.5849) loss 7.3391 (7.8720) grad_norm 2.0469 (2.1552) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:43:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][480/625] eta 0:01:25 lr 0.000946 wd 0.0500 time 0.5817 (0.5863) data time 0.0007 (0.0025) model time 0.5810 (0.5847) loss 7.0237 (7.8605) grad_norm 2.2058 (2.1605) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:43:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][490/625] eta 0:01:19 lr 0.000945 wd 0.0500 time 0.5845 (0.5861) data time 0.0006 (0.0024) model time 0.5839 (0.5845) loss 6.7069 (7.8614) grad_norm 1.6719 (2.1632) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:43:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][500/625] eta 0:01:13 lr 0.000945 wd 0.0500 time 0.5796 (0.5859) data time 0.0006 (0.0024) model time 0.5789 (0.5843) loss 7.3786 (7.8593) grad_norm 2.1638 (2.1673) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:43:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][510/625] eta 0:01:07 lr 0.000945 wd 0.0500 time 0.5804 (0.5858) data time 0.0006 (0.0024) model time 0.5798 (0.5841) loss 7.6395 (7.8705) grad_norm 1.5132 (2.1632) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:43:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][520/625] eta 0:01:01 lr 0.000945 wd 0.0500 time 0.5722 (0.5856) data time 0.0008 (0.0023) model time 0.5714 (0.5839) loss 8.0442 (7.8677) grad_norm 3.4515 (2.1688) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:43:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][530/625] eta 0:00:55 lr 0.000945 wd 0.0500 time 0.5833 (0.5854) data time 0.0007 (0.0023) model time 0.5825 (0.5838) loss 9.3070 (7.8685) grad_norm 1.5682 (2.1674) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:44:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][540/625] eta 0:00:49 lr 0.000945 wd 0.0500 time 0.5845 (0.5853) data time 0.0006 (0.0023) model time 0.5840 (0.5836) loss 7.9592 (7.8664) grad_norm 2.5019 (2.1684) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:44:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][550/625] eta 0:00:43 lr 0.000945 wd 0.0500 time 0.5866 (0.5851) data time 0.0008 (0.0023) model time 0.5858 (0.5835) loss 8.3142 (7.8661) grad_norm 1.5048 (2.1642) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:44:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][560/625] eta 0:00:38 lr 0.000945 wd 0.0500 time 0.5790 (0.5850) data time 0.0007 (0.0022) model time 0.5782 (0.5834) loss 9.7652 (7.8650) grad_norm 2.8068 (2.1668) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:44:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][570/625] eta 0:00:32 lr 0.000945 wd 0.0500 time 0.5830 (0.5849) data time 0.0007 (0.0022) model time 0.5823 (0.5833) loss 8.5886 (7.8690) grad_norm 1.4461 (2.1654) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:44:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][580/625] eta 0:00:26 lr 0.000945 wd 0.0500 time 0.5795 (0.5848) data time 0.0009 (0.0022) model time 0.5786 (0.5831) loss 6.1487 (7.8587) grad_norm 1.9167 (2.1598) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:44:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][590/625] eta 0:00:20 lr 0.000945 wd 0.0500 time 0.5812 (0.5847) data time 0.0006 (0.0022) model time 0.5806 (0.5830) loss 8.9775 (7.8528) grad_norm 1.8993 (2.1602) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:44:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][600/625] eta 0:00:14 lr 0.000944 wd 0.0500 time 0.7775 (0.5859) data time 0.0006 (0.0021) model time 0.7768 (0.5844) loss 8.6300 (7.8494) grad_norm 1.6542 (2.1608) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:44:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][610/625] eta 0:00:08 lr 0.000944 wd 0.0500 time 0.5817 (0.5869) data time 0.0006 (0.0021) model time 0.5811 (0.5855) loss 6.1250 (7.8476) grad_norm 1.5842 (2.1568) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:44:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [105/300][620/625] eta 0:00:02 lr 0.000944 wd 0.0500 time 0.5814 (0.5873) data time 0.0006 (0.0021) model time 0.5808 (0.5859) loss 8.2306 (7.8446) grad_norm 1.3562 (2.1490) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:44:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 105 training takes 0:06:06 +[2024-07-25 00:44:51 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 00:44:53 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 00:44:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.476 (0.476) Loss 0.5430 (0.5430) Acc@1 88.623 (88.623) Acc@5 98.193 (98.193) Mem 22339MB +[2024-07-25 00:44:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.8896 (0.6758) Acc@1 78.564 (85.170) Acc@5 95.557 (97.372) Mem 22339MB +[2024-07-25 00:44:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.9722 (0.8015) Acc@1 76.123 (81.745) Acc@5 94.873 (96.054) Mem 22339MB +[2024-07-25 00:44:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.372 Acc@5 96.047 +[2024-07-25 00:44:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.4% +[2024-07-25 00:44:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.805 (0.805) Loss 0.5151 (0.5151) Acc@1 89.355 (89.355) Acc@5 98.535 (98.535) Mem 22339MB +[2024-07-25 00:44:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.188) Loss 0.8315 (0.6503) Acc@1 80.078 (85.866) Acc@5 96.143 (97.674) Mem 22339MB +[2024-07-25 00:45:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.158) Loss 0.9434 (0.7635) Acc@1 75.977 (82.587) Acc@5 95.068 (96.468) Mem 22339MB +[2024-07-25 00:45:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.286 Acc@5 96.467 +[2024-07-25 00:45:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.3% +[2024-07-25 00:45:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][0/625] eta 0:14:14 lr 0.000944 wd 0.0500 time 1.3670 (1.3670) data time 0.5739 (0.5739) model time 0.0000 (0.0000) loss 9.2095 (9.2095) grad_norm 1.5587 (1.5587) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:45:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][10/625] eta 0:06:37 lr 0.000944 wd 0.0500 time 0.5766 (0.6471) data time 0.0006 (0.0529) model time 0.0000 (0.0000) loss 8.6041 (8.1835) grad_norm 2.1549 (2.0010) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:45:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][20/625] eta 0:06:11 lr 0.000944 wd 0.0500 time 0.5905 (0.6145) data time 0.0006 (0.0282) model time 0.0000 (0.0000) loss 7.7038 (8.2619) grad_norm 2.0791 (1.9444) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:45:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][30/625] eta 0:05:58 lr 0.000944 wd 0.0500 time 0.5817 (0.6023) data time 0.0008 (0.0194) model time 0.0000 (0.0000) loss 7.8134 (8.2101) grad_norm 2.0977 (2.0038) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:45:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][40/625] eta 0:05:48 lr 0.000944 wd 0.0500 time 0.5736 (0.5955) data time 0.0007 (0.0148) model time 0.0000 (0.0000) loss 8.5946 (8.2410) grad_norm 3.3481 (2.1583) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:45:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][50/625] eta 0:05:40 lr 0.000944 wd 0.0500 time 0.5781 (0.5923) data time 0.0008 (0.0121) model time 0.0000 (0.0000) loss 7.0459 (8.0605) grad_norm 1.9868 (2.1073) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:45:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][60/625] eta 0:05:33 lr 0.000944 wd 0.0500 time 0.5829 (0.5900) data time 0.0008 (0.0102) model time 0.5821 (0.5772) loss 7.2002 (8.1452) grad_norm 1.4369 (2.0550) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:45:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][70/625] eta 0:05:26 lr 0.000944 wd 0.0500 time 0.5798 (0.5883) data time 0.0006 (0.0089) model time 0.5792 (0.5773) loss 7.7437 (8.1152) grad_norm 2.0709 (2.0258) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:45:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][80/625] eta 0:05:19 lr 0.000944 wd 0.0500 time 0.5816 (0.5869) data time 0.0006 (0.0079) model time 0.5810 (0.5769) loss 7.8985 (8.0802) grad_norm 1.6307 (2.0739) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:45:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][90/625] eta 0:05:13 lr 0.000943 wd 0.0500 time 0.5796 (0.5859) data time 0.0006 (0.0071) model time 0.5790 (0.5769) loss 7.2580 (8.0550) grad_norm 2.1879 (2.0621) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:45:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][100/625] eta 0:05:07 lr 0.000943 wd 0.0500 time 0.5777 (0.5851) data time 0.0006 (0.0065) model time 0.5771 (0.5770) loss 6.1769 (8.0339) grad_norm 1.7270 (2.0672) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:46:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][110/625] eta 0:05:01 lr 0.000943 wd 0.0500 time 0.5871 (0.5845) data time 0.0007 (0.0060) model time 0.5864 (0.5770) loss 8.2469 (8.0239) grad_norm 2.4434 (2.0826) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:46:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][120/625] eta 0:04:54 lr 0.000943 wd 0.0500 time 0.5806 (0.5838) data time 0.0008 (0.0056) model time 0.5798 (0.5769) loss 9.1395 (8.0331) grad_norm 2.7442 (2.0771) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:46:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][130/625] eta 0:04:48 lr 0.000943 wd 0.0500 time 0.5785 (0.5833) data time 0.0008 (0.0052) model time 0.5777 (0.5768) loss 8.1121 (8.0063) grad_norm 1.3527 (2.1000) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:46:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][140/625] eta 0:04:42 lr 0.000943 wd 0.0500 time 0.5758 (0.5829) data time 0.0007 (0.0049) model time 0.5751 (0.5767) loss 7.8639 (7.9985) grad_norm 2.8261 (2.0991) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:46:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][150/625] eta 0:04:36 lr 0.000943 wd 0.0500 time 0.5891 (0.5827) data time 0.0008 (0.0046) model time 0.5882 (0.5769) loss 6.7251 (7.9725) grad_norm 2.0398 (2.0836) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:46:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][160/625] eta 0:04:30 lr 0.000943 wd 0.0500 time 0.5820 (0.5827) data time 0.0008 (0.0044) model time 0.5812 (0.5774) loss 7.3925 (7.9559) grad_norm 1.9145 (2.0928) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:46:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][170/625] eta 0:04:24 lr 0.000943 wd 0.0500 time 0.5791 (0.5824) data time 0.0006 (0.0042) model time 0.5785 (0.5773) loss 8.3789 (7.9740) grad_norm 3.3427 (2.1105) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:46:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][180/625] eta 0:04:19 lr 0.000943 wd 0.0500 time 0.5795 (0.5822) data time 0.0006 (0.0040) model time 0.5788 (0.5774) loss 8.1278 (7.9921) grad_norm 1.9681 (2.1077) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:46:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][190/625] eta 0:04:13 lr 0.000943 wd 0.0500 time 0.5811 (0.5819) data time 0.0006 (0.0038) model time 0.5806 (0.5773) loss 7.5240 (7.9813) grad_norm 2.2228 (2.1130) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:46:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][200/625] eta 0:04:09 lr 0.000943 wd 0.0500 time 0.7192 (0.5871) data time 0.0007 (0.0037) model time 0.7185 (0.5845) loss 8.3699 (8.0008) grad_norm 1.6980 (2.1367) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:47:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][210/625] eta 0:04:04 lr 0.000942 wd 0.0500 time 0.5847 (0.5882) data time 0.0008 (0.0036) model time 0.5839 (0.5861) loss 9.6483 (8.0106) grad_norm 2.2877 (2.1493) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:47:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][220/625] eta 0:03:58 lr 0.000942 wd 0.0500 time 0.5816 (0.5893) data time 0.0006 (0.0034) model time 0.5810 (0.5876) loss 7.4099 (7.9850) grad_norm 1.8480 (2.1464) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:47:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][230/625] eta 0:03:52 lr 0.000942 wd 0.0500 time 0.5825 (0.5887) data time 0.0006 (0.0033) model time 0.5819 (0.5869) loss 5.8830 (7.9804) grad_norm 1.7018 (2.1379) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:47:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][240/625] eta 0:03:46 lr 0.000942 wd 0.0500 time 0.5837 (0.5883) data time 0.0006 (0.0032) model time 0.5831 (0.5865) loss 7.3399 (7.9799) grad_norm 1.7283 (2.1331) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:47:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][250/625] eta 0:03:40 lr 0.000942 wd 0.0500 time 0.5800 (0.5879) data time 0.0006 (0.0031) model time 0.5794 (0.5860) loss 8.6203 (7.9856) grad_norm 1.8326 (2.1496) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:47:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][260/625] eta 0:03:34 lr 0.000942 wd 0.0500 time 0.5762 (0.5874) data time 0.0007 (0.0030) model time 0.5756 (0.5854) loss 7.7864 (7.9809) grad_norm 2.1819 (2.1418) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:47:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][270/625] eta 0:03:28 lr 0.000942 wd 0.0500 time 0.5841 (0.5870) data time 0.0008 (0.0030) model time 0.5833 (0.5850) loss 6.8113 (7.9702) grad_norm 1.7931 (2.1348) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:47:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][280/625] eta 0:03:22 lr 0.000942 wd 0.0500 time 0.5804 (0.5867) data time 0.0008 (0.0029) model time 0.5796 (0.5846) loss 8.4870 (7.9601) grad_norm 1.7014 (2.1303) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:47:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][290/625] eta 0:03:16 lr 0.000942 wd 0.0500 time 0.5816 (0.5863) data time 0.0009 (0.0028) model time 0.5807 (0.5842) loss 8.4743 (7.9653) grad_norm 1.9565 (2.1207) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:47:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][300/625] eta 0:03:10 lr 0.000942 wd 0.0500 time 0.5788 (0.5860) data time 0.0006 (0.0027) model time 0.5782 (0.5839) loss 9.0043 (7.9663) grad_norm 2.1677 (2.1183) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:48:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][310/625] eta 0:03:04 lr 0.000942 wd 0.0500 time 0.5823 (0.5858) data time 0.0006 (0.0027) model time 0.5817 (0.5837) loss 8.0690 (7.9735) grad_norm 2.0986 (2.1164) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:48:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][320/625] eta 0:02:58 lr 0.000941 wd 0.0500 time 0.5800 (0.5855) data time 0.0008 (0.0026) model time 0.5791 (0.5834) loss 8.3842 (7.9492) grad_norm 2.3275 (2.1304) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:48:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][330/625] eta 0:02:52 lr 0.000941 wd 0.0500 time 0.5835 (0.5853) data time 0.0008 (0.0026) model time 0.5827 (0.5831) loss 8.3573 (7.9488) grad_norm 2.0636 (2.1208) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:48:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][340/625] eta 0:02:46 lr 0.000941 wd 0.0500 time 0.5887 (0.5851) data time 0.0008 (0.0025) model time 0.5879 (0.5830) loss 7.9102 (7.9525) grad_norm 3.3038 (2.1396) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:48:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][350/625] eta 0:02:40 lr 0.000941 wd 0.0500 time 0.5810 (0.5849) data time 0.0008 (0.0025) model time 0.5802 (0.5828) loss 7.9030 (7.9515) grad_norm 2.1978 (2.1372) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:48:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][360/625] eta 0:02:34 lr 0.000941 wd 0.0500 time 0.5836 (0.5847) data time 0.0008 (0.0024) model time 0.5828 (0.5826) loss 8.7370 (7.9625) grad_norm 2.3066 (2.1336) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:48:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][370/625] eta 0:02:29 lr 0.000941 wd 0.0500 time 0.5845 (0.5846) data time 0.0007 (0.0024) model time 0.5838 (0.5825) loss 6.1915 (7.9581) grad_norm 1.7079 (2.1327) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:48:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][380/625] eta 0:02:23 lr 0.000941 wd 0.0500 time 0.5771 (0.5848) data time 0.0008 (0.0024) model time 0.5763 (0.5827) loss 9.3511 (7.9471) grad_norm 2.2097 (2.1329) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:48:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][390/625] eta 0:02:17 lr 0.000941 wd 0.0500 time 0.5801 (0.5846) data time 0.0007 (0.0023) model time 0.5794 (0.5826) loss 7.8820 (7.9331) grad_norm 1.5329 (2.1287) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:48:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][400/625] eta 0:02:11 lr 0.000941 wd 0.0500 time 0.5761 (0.5844) data time 0.0009 (0.0023) model time 0.5751 (0.5824) loss 6.4513 (7.9242) grad_norm 2.3252 (2.1435) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:49:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][410/625] eta 0:02:05 lr 0.000941 wd 0.0500 time 0.7532 (0.5847) data time 0.0008 (0.0023) model time 0.7524 (0.5827) loss 7.7171 (7.9238) grad_norm 1.5142 (2.1437) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:49:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][420/625] eta 0:02:00 lr 0.000941 wd 0.0500 time 0.7274 (0.5868) data time 0.0006 (0.0022) model time 0.7268 (0.5852) loss 7.9716 (7.9275) grad_norm 1.7614 (2.1420) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:49:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][430/625] eta 0:01:54 lr 0.000940 wd 0.0500 time 0.5814 (0.5874) data time 0.0008 (0.0022) model time 0.5806 (0.5858) loss 8.5119 (7.9172) grad_norm 1.6918 (2.1391) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:49:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][440/625] eta 0:01:48 lr 0.000940 wd 0.0500 time 0.5787 (0.5877) data time 0.0008 (0.0022) model time 0.5779 (0.5863) loss 8.4204 (7.9094) grad_norm 2.1758 (2.1344) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:49:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][450/625] eta 0:01:42 lr 0.000940 wd 0.0500 time 0.5817 (0.5875) data time 0.0008 (0.0021) model time 0.5809 (0.5860) loss 8.7535 (7.9050) grad_norm 1.9984 (2.1333) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:49:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][460/625] eta 0:01:36 lr 0.000940 wd 0.0500 time 0.5824 (0.5873) data time 0.0009 (0.0021) model time 0.5815 (0.5858) loss 6.9880 (7.8919) grad_norm 1.9455 (2.1303) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:49:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][470/625] eta 0:01:30 lr 0.000940 wd 0.0500 time 0.5783 (0.5870) data time 0.0008 (0.0021) model time 0.5775 (0.5855) loss 8.2202 (7.8969) grad_norm 1.6600 (2.1353) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:49:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][480/625] eta 0:01:25 lr 0.000940 wd 0.0500 time 0.5769 (0.5868) data time 0.0008 (0.0021) model time 0.5761 (0.5853) loss 6.8972 (7.8998) grad_norm 2.7897 (2.1381) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:49:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][490/625] eta 0:01:19 lr 0.000940 wd 0.0500 time 0.5800 (0.5866) data time 0.0007 (0.0020) model time 0.5794 (0.5851) loss 8.4980 (7.8985) grad_norm 1.7590 (2.1415) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:49:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][500/625] eta 0:01:13 lr 0.000940 wd 0.0500 time 0.5848 (0.5864) data time 0.0006 (0.0020) model time 0.5842 (0.5849) loss 6.7584 (7.8948) grad_norm 1.6780 (2.1340) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:50:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][510/625] eta 0:01:07 lr 0.000940 wd 0.0500 time 0.5811 (0.5862) data time 0.0006 (0.0020) model time 0.5805 (0.5847) loss 8.6543 (7.9026) grad_norm 1.9597 (2.1364) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:50:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][520/625] eta 0:01:01 lr 0.000940 wd 0.0500 time 0.5863 (0.5861) data time 0.0008 (0.0020) model time 0.5855 (0.5845) loss 9.3121 (7.8978) grad_norm 3.0648 (2.1393) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:50:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][530/625] eta 0:00:55 lr 0.000940 wd 0.0500 time 0.5810 (0.5859) data time 0.0009 (0.0019) model time 0.5801 (0.5843) loss 6.5584 (7.8925) grad_norm 2.6506 (2.1458) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:50:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][540/625] eta 0:00:49 lr 0.000940 wd 0.0500 time 0.5848 (0.5857) data time 0.0008 (0.0019) model time 0.5840 (0.5842) loss 8.5737 (7.8866) grad_norm 3.2063 (2.1451) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:50:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][550/625] eta 0:00:43 lr 0.000939 wd 0.0500 time 0.5827 (0.5856) data time 0.0006 (0.0019) model time 0.5821 (0.5840) loss 8.1413 (7.8947) grad_norm 2.2272 (2.1432) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:50:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][560/625] eta 0:00:38 lr 0.000939 wd 0.0500 time 0.5835 (0.5855) data time 0.0007 (0.0019) model time 0.5828 (0.5839) loss 7.3017 (7.8952) grad_norm 1.8909 (2.1443) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:50:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][570/625] eta 0:00:32 lr 0.000939 wd 0.0500 time 0.6188 (0.5854) data time 0.0006 (0.0019) model time 0.6182 (0.5838) loss 7.8708 (7.8899) grad_norm 3.3232 (2.1455) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:50:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][580/625] eta 0:00:26 lr 0.000939 wd 0.0500 time 0.5860 (0.5854) data time 0.0008 (0.0019) model time 0.5852 (0.5838) loss 9.0914 (7.8872) grad_norm 2.3418 (2.1435) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:50:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][590/625] eta 0:00:20 lr 0.000939 wd 0.0500 time 0.5816 (0.5852) data time 0.0006 (0.0018) model time 0.5810 (0.5837) loss 6.6262 (7.8834) grad_norm 1.8406 (2.1446) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:50:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][600/625] eta 0:00:14 lr 0.000939 wd 0.0500 time 0.5807 (0.5853) data time 0.0006 (0.0018) model time 0.5801 (0.5838) loss 7.3823 (7.8763) grad_norm 2.0975 (2.1461) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:50:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][610/625] eta 0:00:08 lr 0.000939 wd 0.0500 time 0.5863 (0.5852) data time 0.0006 (0.0018) model time 0.5857 (0.5836) loss 6.6418 (7.8766) grad_norm 2.4197 (2.1565) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:51:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [106/300][620/625] eta 0:00:02 lr 0.000939 wd 0.0500 time 0.5799 (0.5850) data time 0.0004 (0.0018) model time 0.5795 (0.5835) loss 7.1471 (7.8679) grad_norm 1.9877 (2.1647) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:51:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 106 training takes 0:06:05 +[2024-07-25 00:51:06 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 00:51:07 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 00:51:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.466 (0.466) Loss 0.5103 (0.5103) Acc@1 89.062 (89.062) Acc@5 98.389 (98.389) Mem 22339MB +[2024-07-25 00:51:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.8662 (0.6596) Acc@1 79.541 (85.458) Acc@5 95.654 (97.421) Mem 22339MB +[2024-07-25 00:51:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.9683 (0.7904) Acc@1 75.537 (82.022) Acc@5 93.896 (96.057) Mem 22339MB +[2024-07-25 00:51:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.736 Acc@5 96.073 +[2024-07-25 00:51:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.7% +[2024-07-25 00:51:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 81.74% +[2024-07-25 00:51:11 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 00:51:12 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 00:51:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.470 (0.470) Loss 0.5142 (0.5142) Acc@1 89.404 (89.404) Acc@5 98.584 (98.584) Mem 22339MB +[2024-07-25 00:51:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.8306 (0.6493) Acc@1 80.225 (85.902) Acc@5 95.996 (97.661) Mem 22339MB +[2024-07-25 00:51:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9424 (0.7623) Acc@1 76.074 (82.624) Acc@5 95.117 (96.470) Mem 22339MB +[2024-07-25 00:51:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.326 Acc@5 96.465 +[2024-07-25 00:51:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.3% +[2024-07-25 00:51:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.33% +[2024-07-25 00:51:15 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 00:51:17 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 00:51:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][0/625] eta 0:09:29 lr 0.000939 wd 0.0500 time 0.9117 (0.9117) data time 0.3944 (0.3944) model time 0.0000 (0.0000) loss 7.3424 (7.3424) grad_norm 2.0906 (2.0906) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:51:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][10/625] eta 0:06:35 lr 0.000939 wd 0.0500 time 0.6998 (0.6433) data time 0.0006 (0.0365) model time 0.0000 (0.0000) loss 8.7567 (8.1599) grad_norm 2.6787 (2.1107) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:51:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][20/625] eta 0:06:32 lr 0.000939 wd 0.0500 time 0.7265 (0.6484) data time 0.0006 (0.0196) model time 0.0000 (0.0000) loss 6.3880 (7.9424) grad_norm 1.6246 (2.0751) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:51:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][30/625] eta 0:06:18 lr 0.000939 wd 0.0500 time 0.7423 (0.6369) data time 0.0008 (0.0136) model time 0.0000 (0.0000) loss 8.5280 (7.8805) grad_norm 3.7586 (2.2209) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:51:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][40/625] eta 0:06:03 lr 0.000938 wd 0.0500 time 0.5666 (0.6219) data time 0.0006 (0.0105) model time 0.0000 (0.0000) loss 6.9140 (7.7636) grad_norm 1.4088 (2.1945) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:51:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][50/625] eta 0:05:52 lr 0.000938 wd 0.0500 time 0.5607 (0.6134) data time 0.0009 (0.0086) model time 0.0000 (0.0000) loss 7.4233 (7.8157) grad_norm 2.8373 (2.1551) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:51:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][60/625] eta 0:05:43 lr 0.000938 wd 0.0500 time 0.5628 (0.6073) data time 0.0006 (0.0073) model time 0.5621 (0.5756) loss 8.5655 (7.9091) grad_norm 2.5401 (2.1823) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:52:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][70/625] eta 0:05:34 lr 0.000938 wd 0.0500 time 0.5681 (0.6030) data time 0.0007 (0.0064) model time 0.5675 (0.5755) loss 8.8876 (7.9057) grad_norm 2.8639 (2.2697) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:52:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][80/625] eta 0:05:27 lr 0.000938 wd 0.0500 time 0.5656 (0.6003) data time 0.0008 (0.0057) model time 0.5648 (0.5772) loss 9.4108 (7.9361) grad_norm 1.4258 (2.2187) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:52:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][90/625] eta 0:05:20 lr 0.000938 wd 0.0500 time 0.5672 (0.5985) data time 0.0008 (0.0052) model time 0.5664 (0.5786) loss 7.0450 (7.9311) grad_norm 3.0945 (2.1919) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:52:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][100/625] eta 0:05:13 lr 0.000938 wd 0.0500 time 0.5661 (0.5965) data time 0.0007 (0.0048) model time 0.5655 (0.5784) loss 5.8417 (7.8485) grad_norm 2.4344 (2.1962) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:52:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][110/625] eta 0:05:06 lr 0.000938 wd 0.0500 time 0.5644 (0.5949) data time 0.0007 (0.0044) model time 0.5637 (0.5783) loss 7.9090 (7.8606) grad_norm 3.0359 (2.1916) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:52:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][120/625] eta 0:04:59 lr 0.000938 wd 0.0500 time 0.5665 (0.5935) data time 0.0008 (0.0041) model time 0.5657 (0.5782) loss 8.3402 (7.8514) grad_norm 2.0939 (2.1963) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:52:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][130/625] eta 0:04:53 lr 0.000938 wd 0.0500 time 0.5640 (0.5924) data time 0.0006 (0.0038) model time 0.5633 (0.5781) loss 9.0037 (7.8644) grad_norm 2.0599 (2.1993) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:52:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][140/625] eta 0:04:46 lr 0.000938 wd 0.0500 time 0.5662 (0.5913) data time 0.0007 (0.0036) model time 0.5655 (0.5780) loss 8.6345 (7.8354) grad_norm 4.1564 (2.2650) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:52:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][150/625] eta 0:04:40 lr 0.000937 wd 0.0500 time 0.5660 (0.5903) data time 0.0006 (0.0034) model time 0.5654 (0.5778) loss 7.1602 (7.8489) grad_norm 2.5723 (2.3028) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:52:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][160/625] eta 0:04:34 lr 0.000937 wd 0.0500 time 0.5622 (0.5898) data time 0.0006 (0.0033) model time 0.5616 (0.5780) loss 8.3083 (7.8376) grad_norm 1.7060 (2.2871) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:52:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][170/625] eta 0:04:28 lr 0.000937 wd 0.0500 time 0.5675 (0.5891) data time 0.0006 (0.0032) model time 0.5669 (0.5779) loss 7.0139 (7.8355) grad_norm 1.9541 (2.2556) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:53:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][180/625] eta 0:04:21 lr 0.000937 wd 0.0500 time 0.5701 (0.5885) data time 0.0006 (0.0030) model time 0.5695 (0.5779) loss 6.2271 (7.8113) grad_norm 2.1856 (2.2313) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:53:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][190/625] eta 0:04:15 lr 0.000937 wd 0.0500 time 0.5614 (0.5880) data time 0.0008 (0.0029) model time 0.5606 (0.5779) loss 6.2276 (7.8079) grad_norm 1.6995 (2.2218) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:53:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][200/625] eta 0:04:09 lr 0.000937 wd 0.0500 time 0.5621 (0.5877) data time 0.0006 (0.0029) model time 0.5614 (0.5781) loss 7.8312 (7.7856) grad_norm 2.2115 (2.2104) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:53:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][210/625] eta 0:04:03 lr 0.000937 wd 0.0500 time 0.5607 (0.5874) data time 0.0006 (0.0028) model time 0.5602 (0.5783) loss 7.4563 (7.7810) grad_norm 1.8172 (2.2130) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:53:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][220/625] eta 0:03:57 lr 0.000937 wd 0.0500 time 0.5637 (0.5872) data time 0.0008 (0.0027) model time 0.5629 (0.5785) loss 7.2970 (7.7820) grad_norm 1.8547 (2.2178) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:53:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][230/625] eta 0:03:52 lr 0.000937 wd 0.0500 time 0.7421 (0.5894) data time 0.0007 (0.0026) model time 0.7414 (0.5817) loss 7.5953 (7.7920) grad_norm 1.4656 (2.2004) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:53:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][240/625] eta 0:03:48 lr 0.000937 wd 0.0500 time 0.7366 (0.5923) data time 0.0006 (0.0025) model time 0.7360 (0.5858) loss 7.4174 (7.8037) grad_norm 3.3494 (2.1970) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:53:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][250/625] eta 0:03:42 lr 0.000937 wd 0.0500 time 0.5691 (0.5923) data time 0.0008 (0.0025) model time 0.5683 (0.5860) loss 7.3146 (7.8122) grad_norm 1.9002 (2.1950) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:53:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][260/625] eta 0:03:36 lr 0.000936 wd 0.0500 time 0.5715 (0.5923) data time 0.0008 (0.0024) model time 0.5707 (0.5863) loss 9.4880 (7.8144) grad_norm 2.4884 (2.2024) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:53:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][270/625] eta 0:03:30 lr 0.000936 wd 0.0500 time 0.5646 (0.5918) data time 0.0007 (0.0024) model time 0.5639 (0.5859) loss 6.7321 (7.8045) grad_norm 1.7487 (2.1915) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:54:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][280/625] eta 0:03:23 lr 0.000936 wd 0.0500 time 0.5663 (0.5913) data time 0.0009 (0.0023) model time 0.5655 (0.5854) loss 8.5025 (7.8060) grad_norm 2.3869 (2.1990) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:54:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][290/625] eta 0:03:17 lr 0.000936 wd 0.0500 time 0.5672 (0.5908) data time 0.0006 (0.0023) model time 0.5665 (0.5851) loss 8.2080 (7.8071) grad_norm 2.1128 (2.2084) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:54:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][300/625] eta 0:03:11 lr 0.000936 wd 0.0500 time 0.5698 (0.5904) data time 0.0006 (0.0022) model time 0.5692 (0.5848) loss 8.6728 (7.7874) grad_norm 2.3504 (2.2013) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:54:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][310/625] eta 0:03:06 lr 0.000936 wd 0.0500 time 0.5694 (0.5907) data time 0.0008 (0.0022) model time 0.5686 (0.5853) loss 8.9886 (7.7873) grad_norm 1.4373 (2.2070) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 00:54:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][320/625] eta 0:03:00 lr 0.000936 wd 0.0500 time 0.5624 (0.5903) data time 0.0009 (0.0021) model time 0.5615 (0.5850) loss 8.1136 (7.7844) grad_norm 1.8302 (2.1977) loss_scale 4096.0000 (2105.4206) mem 22339MB +[2024-07-25 00:54:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][330/625] eta 0:02:54 lr 0.000936 wd 0.0500 time 0.5678 (0.5899) data time 0.0008 (0.0021) model time 0.5670 (0.5847) loss 6.7062 (7.7815) grad_norm 1.8629 (2.1914) loss_scale 4096.0000 (2165.5589) mem 22339MB +[2024-07-25 00:54:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][340/625] eta 0:02:48 lr 0.000936 wd 0.0500 time 0.5641 (0.5896) data time 0.0007 (0.0021) model time 0.5635 (0.5845) loss 8.1387 (7.7940) grad_norm 2.2738 (2.1916) loss_scale 4096.0000 (2222.1701) mem 22339MB +[2024-07-25 00:54:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][350/625] eta 0:02:42 lr 0.000936 wd 0.0500 time 0.5676 (0.5892) data time 0.0008 (0.0020) model time 0.5668 (0.5842) loss 8.0221 (7.7967) grad_norm 2.2625 (2.1861) loss_scale 4096.0000 (2275.5556) mem 22339MB +[2024-07-25 00:54:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][360/625] eta 0:02:36 lr 0.000936 wd 0.0500 time 0.5646 (0.5889) data time 0.0006 (0.0020) model time 0.5640 (0.5840) loss 8.4366 (7.8007) grad_norm 1.8608 (2.1844) loss_scale 4096.0000 (2325.9834) mem 22339MB +[2024-07-25 00:54:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][370/625] eta 0:02:30 lr 0.000935 wd 0.0500 time 0.5618 (0.5886) data time 0.0008 (0.0020) model time 0.5610 (0.5837) loss 8.2240 (7.7993) grad_norm 1.5746 (2.1844) loss_scale 4096.0000 (2373.6927) mem 22339MB +[2024-07-25 00:55:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][380/625] eta 0:02:24 lr 0.000935 wd 0.0500 time 0.5681 (0.5883) data time 0.0008 (0.0019) model time 0.5673 (0.5835) loss 7.7018 (7.8141) grad_norm 1.8705 (2.1861) loss_scale 4096.0000 (2418.8976) mem 22339MB +[2024-07-25 00:55:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][390/625] eta 0:02:18 lr 0.000935 wd 0.0500 time 0.5653 (0.5880) data time 0.0006 (0.0019) model time 0.5646 (0.5833) loss 7.0642 (7.8243) grad_norm 1.6690 (2.1746) loss_scale 4096.0000 (2461.7903) mem 22339MB +[2024-07-25 00:55:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][400/625] eta 0:02:12 lr 0.000935 wd 0.0500 time 0.5697 (0.5878) data time 0.0008 (0.0019) model time 0.5689 (0.5831) loss 7.8205 (7.8364) grad_norm 2.5866 (2.1657) loss_scale 4096.0000 (2502.5436) mem 22339MB +[2024-07-25 00:55:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][410/625] eta 0:02:06 lr 0.000935 wd 0.0500 time 0.5714 (0.5876) data time 0.0007 (0.0019) model time 0.5707 (0.5830) loss 7.4220 (7.8310) grad_norm 2.3199 (2.1713) loss_scale 4096.0000 (2541.3139) mem 22339MB +[2024-07-25 00:55:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][420/625] eta 0:02:00 lr 0.000935 wd 0.0500 time 0.5632 (0.5873) data time 0.0006 (0.0018) model time 0.5626 (0.5828) loss 8.5314 (7.8384) grad_norm 3.1389 (2.1874) loss_scale 4096.0000 (2578.2423) mem 22339MB +[2024-07-25 00:55:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][430/625] eta 0:01:54 lr 0.000935 wd 0.0500 time 0.5656 (0.5871) data time 0.0008 (0.0018) model time 0.5647 (0.5826) loss 7.6588 (7.8270) grad_norm 2.3666 (2.1900) loss_scale 4096.0000 (2613.4571) mem 22339MB +[2024-07-25 00:55:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][440/625] eta 0:01:48 lr 0.000935 wd 0.0500 time 0.5664 (0.5869) data time 0.0006 (0.0018) model time 0.5658 (0.5825) loss 7.6608 (7.8335) grad_norm 2.1965 (2.1900) loss_scale 4096.0000 (2647.0748) mem 22339MB +[2024-07-25 00:55:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][450/625] eta 0:01:42 lr 0.000935 wd 0.0500 time 0.7305 (0.5874) data time 0.0008 (0.0018) model time 0.7297 (0.5831) loss 6.4352 (7.8386) grad_norm 1.9542 (2.1900) loss_scale 4096.0000 (2679.2018) mem 22339MB +[2024-07-25 00:55:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][460/625] eta 0:01:37 lr 0.000935 wd 0.0500 time 0.5656 (0.5895) data time 0.0006 (0.0018) model time 0.5650 (0.5856) loss 6.5731 (7.8371) grad_norm 1.5564 (2.1903) loss_scale 4096.0000 (2709.9349) mem 22339MB +[2024-07-25 00:55:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][470/625] eta 0:01:31 lr 0.000935 wd 0.0500 time 0.5641 (0.5898) data time 0.0008 (0.0017) model time 0.5633 (0.5860) loss 8.3875 (7.8441) grad_norm 1.8693 (2.1874) loss_scale 4096.0000 (2739.3631) mem 22339MB +[2024-07-25 00:56:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][480/625] eta 0:01:25 lr 0.000935 wd 0.0500 time 0.5697 (0.5902) data time 0.0008 (0.0017) model time 0.5689 (0.5866) loss 5.4924 (7.8388) grad_norm 2.2917 (2.1830) loss_scale 4096.0000 (2767.5676) mem 22339MB +[2024-07-25 00:56:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][490/625] eta 0:01:19 lr 0.000934 wd 0.0500 time 0.5620 (0.5899) data time 0.0008 (0.0017) model time 0.5612 (0.5863) loss 7.6134 (7.8420) grad_norm 3.3124 (2.1864) loss_scale 4096.0000 (2794.6232) mem 22339MB +[2024-07-25 00:56:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][500/625] eta 0:01:13 lr 0.000934 wd 0.0500 time 0.5635 (0.5897) data time 0.0008 (0.0017) model time 0.5627 (0.5861) loss 7.6204 (7.8425) grad_norm 2.5742 (2.1933) loss_scale 4096.0000 (2820.5988) mem 22339MB +[2024-07-25 00:56:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][510/625] eta 0:01:07 lr 0.000934 wd 0.0500 time 0.5682 (0.5894) data time 0.0009 (0.0017) model time 0.5673 (0.5858) loss 6.5860 (7.8403) grad_norm 1.5994 (2.1884) loss_scale 4096.0000 (2845.5577) mem 22339MB +[2024-07-25 00:56:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][520/625] eta 0:01:01 lr 0.000934 wd 0.0500 time 0.5647 (0.5892) data time 0.0006 (0.0017) model time 0.5641 (0.5856) loss 9.0280 (7.8530) grad_norm 1.9222 (2.1925) loss_scale 4096.0000 (2869.5585) mem 22339MB +[2024-07-25 00:56:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][530/625] eta 0:00:55 lr 0.000934 wd 0.0500 time 0.7033 (0.5892) data time 0.0007 (0.0016) model time 0.7026 (0.5857) loss 8.1967 (7.8534) grad_norm 2.2967 (2.2059) loss_scale 4096.0000 (2892.6554) mem 22339MB +[2024-07-25 00:56:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][540/625] eta 0:00:50 lr 0.000934 wd 0.0500 time 0.5617 (0.5890) data time 0.0006 (0.0016) model time 0.5611 (0.5856) loss 9.1266 (7.8542) grad_norm 1.6862 (2.2070) loss_scale 4096.0000 (2914.8983) mem 22339MB +[2024-07-25 00:56:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][550/625] eta 0:00:44 lr 0.000934 wd 0.0500 time 0.5687 (0.5888) data time 0.0008 (0.0016) model time 0.5679 (0.5854) loss 8.4546 (7.8427) grad_norm 1.7179 (2.2063) loss_scale 4096.0000 (2936.3339) mem 22339MB +[2024-07-25 00:56:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][560/625] eta 0:00:38 lr 0.000934 wd 0.0500 time 0.5669 (0.5886) data time 0.0006 (0.0016) model time 0.5663 (0.5852) loss 5.8789 (7.8433) grad_norm 1.8287 (2.2053) loss_scale 4096.0000 (2957.0053) mem 22339MB +[2024-07-25 00:56:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][570/625] eta 0:00:32 lr 0.000934 wd 0.0500 time 0.5690 (0.5884) data time 0.0006 (0.0016) model time 0.5685 (0.5851) loss 9.7024 (7.8463) grad_norm 1.6138 (2.2031) loss_scale 4096.0000 (2976.9527) mem 22339MB +[2024-07-25 00:56:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][580/625] eta 0:00:26 lr 0.000934 wd 0.0500 time 0.5689 (0.5882) data time 0.0008 (0.0016) model time 0.5681 (0.5849) loss 7.7158 (7.8514) grad_norm 3.9701 (2.2045) loss_scale 4096.0000 (2996.2134) mem 22339MB +[2024-07-25 00:57:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][590/625] eta 0:00:20 lr 0.000934 wd 0.0500 time 0.5635 (0.5881) data time 0.0006 (0.0016) model time 0.5629 (0.5848) loss 7.0779 (7.8520) grad_norm 2.4521 (2.2078) loss_scale 4096.0000 (3014.8223) mem 22339MB +[2024-07-25 00:57:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][600/625] eta 0:00:14 lr 0.000933 wd 0.0500 time 0.5648 (0.5879) data time 0.0006 (0.0015) model time 0.5642 (0.5846) loss 8.7814 (7.8435) grad_norm 2.7741 (2.2062) loss_scale 4096.0000 (3032.8120) mem 22339MB +[2024-07-25 00:57:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][610/625] eta 0:00:08 lr 0.000933 wd 0.0500 time 0.5619 (0.5877) data time 0.0006 (0.0015) model time 0.5613 (0.5845) loss 7.2828 (7.8440) grad_norm 3.3694 (2.2142) loss_scale 4096.0000 (3050.2128) mem 22339MB +[2024-07-25 00:57:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [107/300][620/625] eta 0:00:02 lr 0.000933 wd 0.0500 time 0.5713 (0.5876) data time 0.0006 (0.0015) model time 0.5707 (0.5843) loss 7.7621 (7.8394) grad_norm 1.8877 (2.2108) loss_scale 4096.0000 (3067.0531) mem 22339MB +[2024-07-25 00:57:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 107 training takes 0:06:07 +[2024-07-25 00:57:24 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 00:57:26 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 00:57:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.468 (0.468) Loss 0.5649 (0.5649) Acc@1 87.891 (87.891) Acc@5 98.438 (98.438) Mem 22339MB +[2024-07-25 00:57:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.9146 (0.6909) Acc@1 79.199 (85.121) Acc@5 95.117 (97.372) Mem 22339MB +[2024-07-25 00:57:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.9834 (0.8105) Acc@1 75.977 (81.676) Acc@5 94.238 (96.043) Mem 22339MB +[2024-07-25 00:57:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.364 Acc@5 96.025 +[2024-07-25 00:57:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.4% +[2024-07-25 00:57:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.899 (0.899) Loss 0.5127 (0.5127) Acc@1 89.453 (89.453) Acc@5 98.584 (98.584) Mem 22339MB +[2024-07-25 00:57:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.196) Loss 0.8286 (0.6480) Acc@1 80.371 (85.951) Acc@5 95.996 (97.647) Mem 22339MB +[2024-07-25 00:57:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.162) Loss 0.9399 (0.7610) Acc@1 76.074 (82.629) Acc@5 95.068 (96.447) Mem 22339MB +[2024-07-25 00:57:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.332 Acc@5 96.445 +[2024-07-25 00:57:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.3% +[2024-07-25 00:57:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.33% +[2024-07-25 00:57:33 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 00:57:34 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 00:57:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][0/625] eta 0:09:08 lr 0.000933 wd 0.0500 time 0.8773 (0.8773) data time 0.3547 (0.3547) model time 0.0000 (0.0000) loss 8.1109 (8.1109) grad_norm 2.4540 (2.4540) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:57:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][10/625] eta 0:06:10 lr 0.000933 wd 0.0500 time 0.5691 (0.6023) data time 0.0006 (0.0330) model time 0.0000 (0.0000) loss 7.7816 (8.1032) grad_norm 1.4354 (1.9629) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:57:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][20/625] eta 0:05:57 lr 0.000933 wd 0.0500 time 0.5681 (0.5903) data time 0.0008 (0.0177) model time 0.0000 (0.0000) loss 8.0630 (7.9606) grad_norm 1.7977 (1.9112) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:57:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][30/625] eta 0:05:48 lr 0.000933 wd 0.0500 time 0.5621 (0.5858) data time 0.0008 (0.0124) model time 0.0000 (0.0000) loss 6.4879 (7.7668) grad_norm 1.8397 (1.9883) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:57:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][40/625] eta 0:05:44 lr 0.000933 wd 0.0500 time 0.7585 (0.5884) data time 0.0006 (0.0097) model time 0.0000 (0.0000) loss 7.8775 (7.9086) grad_norm 1.4929 (1.9651) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:58:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][50/625] eta 0:05:46 lr 0.000933 wd 0.0500 time 0.7212 (0.6033) data time 0.0006 (0.0080) model time 0.0000 (0.0000) loss 7.5122 (7.8724) grad_norm 2.2409 (1.9800) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:58:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][60/625] eta 0:05:42 lr 0.000933 wd 0.0500 time 0.7605 (0.6063) data time 0.0008 (0.0068) model time 0.7597 (0.6203) loss 7.4474 (7.8954) grad_norm 3.0625 (2.0028) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:58:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][70/625] eta 0:05:36 lr 0.000933 wd 0.0500 time 0.7491 (0.6056) data time 0.0006 (0.0060) model time 0.7485 (0.6104) loss 8.2405 (7.8658) grad_norm 1.9714 (2.0068) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:58:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][80/625] eta 0:05:28 lr 0.000933 wd 0.0500 time 0.5658 (0.6021) data time 0.0006 (0.0053) model time 0.5651 (0.5992) loss 6.9490 (7.8908) grad_norm 1.6485 (2.0336) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:58:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][90/625] eta 0:05:20 lr 0.000932 wd 0.0500 time 0.5668 (0.5993) data time 0.0006 (0.0048) model time 0.5662 (0.5934) loss 8.4583 (7.8702) grad_norm 2.2260 (2.0656) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:58:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][100/625] eta 0:05:13 lr 0.000932 wd 0.0500 time 0.5628 (0.5971) data time 0.0010 (0.0045) model time 0.5618 (0.5898) loss 10.1577 (7.9364) grad_norm 1.8459 (2.0518) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:58:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][110/625] eta 0:05:06 lr 0.000932 wd 0.0500 time 0.5622 (0.5952) data time 0.0006 (0.0041) model time 0.5616 (0.5875) loss 7.5771 (7.9183) grad_norm 2.5884 (2.0643) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:58:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][120/625] eta 0:04:59 lr 0.000932 wd 0.0500 time 0.5719 (0.5938) data time 0.0006 (0.0039) model time 0.5713 (0.5860) loss 5.5246 (7.8835) grad_norm 1.8206 (2.0615) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:58:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][130/625] eta 0:04:53 lr 0.000932 wd 0.0500 time 0.5704 (0.5927) data time 0.0008 (0.0036) model time 0.5696 (0.5850) loss 6.8653 (7.8838) grad_norm 2.0283 (2.0791) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:58:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][140/625] eta 0:04:46 lr 0.000932 wd 0.0500 time 0.5710 (0.5917) data time 0.0008 (0.0034) model time 0.5702 (0.5843) loss 8.2294 (7.8643) grad_norm 2.2858 (2.0863) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:59:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][150/625] eta 0:04:40 lr 0.000932 wd 0.0500 time 0.5681 (0.5908) data time 0.0008 (0.0032) model time 0.5673 (0.5836) loss 8.5951 (7.8821) grad_norm 2.8119 (2.0865) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:59:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][160/625] eta 0:04:34 lr 0.000932 wd 0.0500 time 0.5647 (0.5899) data time 0.0008 (0.0031) model time 0.5639 (0.5829) loss 8.6378 (7.8485) grad_norm 3.0065 (2.0986) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:59:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][170/625] eta 0:04:28 lr 0.000932 wd 0.0500 time 0.5680 (0.5892) data time 0.0008 (0.0030) model time 0.5672 (0.5824) loss 8.5926 (7.8773) grad_norm 1.6603 (2.0989) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:59:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][180/625] eta 0:04:21 lr 0.000932 wd 0.0500 time 0.5716 (0.5886) data time 0.0006 (0.0028) model time 0.5710 (0.5820) loss 7.6889 (7.8699) grad_norm 2.8585 (2.0912) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:59:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][190/625] eta 0:04:15 lr 0.000932 wd 0.0500 time 0.5636 (0.5880) data time 0.0006 (0.0027) model time 0.5630 (0.5816) loss 6.6401 (7.8670) grad_norm 1.5235 (2.0835) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:59:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][200/625] eta 0:04:09 lr 0.000931 wd 0.0500 time 0.5641 (0.5875) data time 0.0006 (0.0026) model time 0.5635 (0.5812) loss 8.1643 (7.8836) grad_norm 2.1188 (2.1001) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:59:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][210/625] eta 0:04:03 lr 0.000931 wd 0.0500 time 0.5642 (0.5870) data time 0.0006 (0.0026) model time 0.5636 (0.5809) loss 6.9143 (7.8670) grad_norm 1.5411 (2.0920) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:59:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][220/625] eta 0:03:57 lr 0.000931 wd 0.0500 time 0.5643 (0.5865) data time 0.0008 (0.0025) model time 0.5635 (0.5806) loss 8.3657 (7.8818) grad_norm 2.4982 (2.0937) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:59:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][230/625] eta 0:03:51 lr 0.000931 wd 0.0500 time 0.5641 (0.5861) data time 0.0008 (0.0024) model time 0.5633 (0.5804) loss 6.9796 (7.8732) grad_norm 2.4256 (2.1064) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 00:59:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][240/625] eta 0:03:45 lr 0.000931 wd 0.0500 time 0.5646 (0.5858) data time 0.0009 (0.0024) model time 0.5636 (0.5802) loss 6.8176 (7.8929) grad_norm 1.8186 (2.1228) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:00:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][250/625] eta 0:03:39 lr 0.000931 wd 0.0500 time 0.5655 (0.5854) data time 0.0006 (0.0023) model time 0.5649 (0.5800) loss 6.8645 (7.8996) grad_norm 2.3974 (2.1480) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:00:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][260/625] eta 0:03:33 lr 0.000931 wd 0.0500 time 0.5650 (0.5852) data time 0.0008 (0.0023) model time 0.5642 (0.5799) loss 8.9057 (7.9232) grad_norm 2.2688 (2.1686) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:00:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][270/625] eta 0:03:28 lr 0.000931 wd 0.0500 time 0.7246 (0.5877) data time 0.0006 (0.0022) model time 0.7240 (0.5832) loss 7.0229 (7.9149) grad_norm 3.8849 (2.1776) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:00:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][280/625] eta 0:03:23 lr 0.000931 wd 0.0500 time 0.5214 (0.5891) data time 0.0006 (0.0021) model time 0.5208 (0.5851) loss 6.0095 (7.8957) grad_norm 1.9786 (2.1783) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:00:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][290/625] eta 0:03:17 lr 0.000931 wd 0.0500 time 0.5634 (0.5892) data time 0.0008 (0.0021) model time 0.5626 (0.5854) loss 7.9807 (7.9207) grad_norm 2.2202 (2.1734) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:00:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][300/625] eta 0:03:11 lr 0.000931 wd 0.0500 time 0.5625 (0.5888) data time 0.0008 (0.0021) model time 0.5618 (0.5850) loss 7.5464 (7.9204) grad_norm 1.8405 (2.1705) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:00:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][310/625] eta 0:03:05 lr 0.000930 wd 0.0500 time 0.5658 (0.5885) data time 0.0006 (0.0020) model time 0.5652 (0.5847) loss 7.7401 (7.9029) grad_norm 3.5072 (2.1807) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:00:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][320/625] eta 0:02:59 lr 0.000930 wd 0.0500 time 0.5636 (0.5881) data time 0.0006 (0.0020) model time 0.5631 (0.5844) loss 8.6933 (7.9094) grad_norm 1.9319 (2.1827) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:00:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][330/625] eta 0:02:53 lr 0.000930 wd 0.0500 time 0.5696 (0.5878) data time 0.0008 (0.0019) model time 0.5688 (0.5841) loss 6.9827 (7.8950) grad_norm 1.9234 (2.1766) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:00:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][340/625] eta 0:02:47 lr 0.000930 wd 0.0500 time 0.5674 (0.5875) data time 0.0008 (0.0019) model time 0.5666 (0.5839) loss 7.6442 (7.8863) grad_norm 1.9921 (2.1748) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:01:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][350/625] eta 0:02:41 lr 0.000930 wd 0.0500 time 0.5681 (0.5873) data time 0.0007 (0.0019) model time 0.5675 (0.5837) loss 6.0633 (7.8746) grad_norm 1.7366 (2.1764) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:01:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][360/625] eta 0:02:35 lr 0.000930 wd 0.0500 time 0.5661 (0.5870) data time 0.0006 (0.0019) model time 0.5655 (0.5834) loss 8.6183 (7.8814) grad_norm 2.1568 (2.1921) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:01:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][370/625] eta 0:02:29 lr 0.000930 wd 0.0500 time 0.5645 (0.5868) data time 0.0007 (0.0018) model time 0.5638 (0.5833) loss 6.6576 (7.8673) grad_norm 2.3004 (2.1948) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:01:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][380/625] eta 0:02:23 lr 0.000930 wd 0.0500 time 0.5685 (0.5866) data time 0.0009 (0.0018) model time 0.5676 (0.5831) loss 9.3661 (7.8572) grad_norm 1.9173 (2.1885) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:01:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][390/625] eta 0:02:17 lr 0.000930 wd 0.0500 time 0.5630 (0.5864) data time 0.0008 (0.0018) model time 0.5622 (0.5830) loss 8.4684 (7.8604) grad_norm 2.8929 (2.1920) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:01:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][400/625] eta 0:02:11 lr 0.000930 wd 0.0500 time 0.5618 (0.5862) data time 0.0006 (0.0018) model time 0.5613 (0.5828) loss 6.8271 (7.8567) grad_norm 2.8238 (2.1992) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:01:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][410/625] eta 0:02:05 lr 0.000930 wd 0.0500 time 0.5659 (0.5860) data time 0.0006 (0.0017) model time 0.5652 (0.5827) loss 9.1576 (7.8616) grad_norm 2.0361 (2.2032) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:01:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][420/625] eta 0:02:00 lr 0.000929 wd 0.0500 time 0.5724 (0.5858) data time 0.0006 (0.0017) model time 0.5719 (0.5825) loss 8.4564 (7.8516) grad_norm 1.7745 (2.1996) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:01:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][430/625] eta 0:01:54 lr 0.000929 wd 0.0500 time 0.5753 (0.5856) data time 0.0007 (0.0017) model time 0.5745 (0.5824) loss 9.3880 (7.8569) grad_norm 2.0154 (2.2007) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:01:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][440/625] eta 0:01:48 lr 0.000929 wd 0.0500 time 0.5636 (0.5854) data time 0.0006 (0.0017) model time 0.5630 (0.5822) loss 7.9178 (7.8495) grad_norm 1.9006 (2.1932) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:01:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][450/625] eta 0:01:42 lr 0.000929 wd 0.0500 time 0.5640 (0.5852) data time 0.0007 (0.0016) model time 0.5632 (0.5820) loss 8.8498 (7.8543) grad_norm 1.6290 (2.1835) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:02:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][460/625] eta 0:01:36 lr 0.000929 wd 0.0500 time 0.5639 (0.5850) data time 0.0008 (0.0016) model time 0.5631 (0.5819) loss 8.1518 (7.8587) grad_norm 2.8047 (2.1798) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:02:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][470/625] eta 0:01:30 lr 0.000929 wd 0.0500 time 0.5705 (0.5849) data time 0.0008 (0.0016) model time 0.5697 (0.5818) loss 7.2952 (7.8446) grad_norm 2.2612 (2.1813) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:02:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][480/625] eta 0:01:24 lr 0.000929 wd 0.0500 time 0.5710 (0.5847) data time 0.0008 (0.0016) model time 0.5701 (0.5817) loss 7.3619 (7.8411) grad_norm 1.5715 (2.1746) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:02:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][490/625] eta 0:01:19 lr 0.000929 wd 0.0500 time 0.5647 (0.5865) data time 0.0006 (0.0016) model time 0.5641 (0.5837) loss 7.5694 (7.8369) grad_norm 1.7412 (2.1692) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:02:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][500/625] eta 0:01:13 lr 0.000929 wd 0.0500 time 0.6765 (0.5873) data time 0.0008 (0.0016) model time 0.6757 (0.5846) loss 7.0801 (7.8306) grad_norm 1.7923 (2.1662) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:02:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][510/625] eta 0:01:07 lr 0.000929 wd 0.0500 time 0.7559 (0.5879) data time 0.0009 (0.0015) model time 0.7551 (0.5853) loss 8.1263 (7.8321) grad_norm 2.1790 (2.1672) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:02:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][520/625] eta 0:01:01 lr 0.000929 wd 0.0500 time 0.5656 (0.5876) data time 0.0007 (0.0015) model time 0.5649 (0.5851) loss 7.6756 (7.8343) grad_norm 2.9643 (2.1680) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:02:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][530/625] eta 0:00:55 lr 0.000929 wd 0.0500 time 0.5676 (0.5875) data time 0.0006 (0.0015) model time 0.5670 (0.5849) loss 7.0091 (7.8362) grad_norm 2.3804 (2.1672) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:02:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][540/625] eta 0:00:49 lr 0.000928 wd 0.0500 time 0.5668 (0.5873) data time 0.0008 (0.0015) model time 0.5660 (0.5847) loss 7.7619 (7.8397) grad_norm 2.7889 (2.1764) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:02:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][550/625] eta 0:00:44 lr 0.000928 wd 0.0500 time 0.5639 (0.5871) data time 0.0008 (0.0015) model time 0.5631 (0.5846) loss 7.1254 (7.8373) grad_norm 2.3064 (2.1875) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:03:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][560/625] eta 0:00:38 lr 0.000928 wd 0.0500 time 0.5707 (0.5869) data time 0.0006 (0.0015) model time 0.5700 (0.5844) loss 7.5861 (7.8402) grad_norm 1.8211 (2.1833) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:03:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][570/625] eta 0:00:32 lr 0.000928 wd 0.0500 time 0.5672 (0.5868) data time 0.0006 (0.0015) model time 0.5667 (0.5843) loss 8.8071 (7.8438) grad_norm 1.8532 (2.1815) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:03:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][580/625] eta 0:00:26 lr 0.000928 wd 0.0500 time 0.5687 (0.5866) data time 0.0008 (0.0015) model time 0.5680 (0.5841) loss 8.6943 (7.8418) grad_norm 1.8806 (2.1800) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:03:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][590/625] eta 0:00:20 lr 0.000928 wd 0.0500 time 0.5666 (0.5865) data time 0.0006 (0.0015) model time 0.5660 (0.5840) loss 6.6135 (7.8405) grad_norm 1.8162 (2.1799) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:03:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][600/625] eta 0:00:14 lr 0.000928 wd 0.0500 time 0.5693 (0.5863) data time 0.0006 (0.0014) model time 0.5687 (0.5839) loss 7.9418 (7.8433) grad_norm 3.0873 (2.1832) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:03:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][610/625] eta 0:00:08 lr 0.000928 wd 0.0500 time 0.5615 (0.5862) data time 0.0004 (0.0014) model time 0.5610 (0.5838) loss 7.7357 (7.8382) grad_norm 1.9342 (2.1872) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:03:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [108/300][620/625] eta 0:00:02 lr 0.000928 wd 0.0500 time 0.5634 (0.5860) data time 0.0004 (0.0014) model time 0.5630 (0.5836) loss 9.1017 (7.8378) grad_norm 2.1373 (2.1851) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:03:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 108 training takes 0:06:06 +[2024-07-25 01:03:41 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 01:03:42 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 01:03:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.473 (0.473) Loss 0.5723 (0.5723) Acc@1 88.672 (88.672) Acc@5 98.486 (98.486) Mem 22339MB +[2024-07-25 01:03:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.8828 (0.6986) Acc@1 78.955 (85.112) Acc@5 95.654 (97.443) Mem 22339MB +[2024-07-25 01:03:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 1.0352 (0.8211) Acc@1 75.146 (81.678) Acc@5 94.336 (96.101) Mem 22339MB +[2024-07-25 01:03:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.332 Acc@5 96.057 +[2024-07-25 01:03:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.3% +[2024-07-25 01:03:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 1.093 (1.093) Loss 0.5122 (0.5122) Acc@1 89.404 (89.404) Acc@5 98.584 (98.584) Mem 22339MB +[2024-07-25 01:03:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.214) Loss 0.8276 (0.6469) Acc@1 80.615 (86.004) Acc@5 96.045 (97.652) Mem 22339MB +[2024-07-25 01:03:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.172) Loss 0.9385 (0.7598) Acc@1 76.123 (82.685) Acc@5 95.215 (96.473) Mem 22339MB +[2024-07-25 01:03:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.370 Acc@5 96.459 +[2024-07-25 01:03:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.4% +[2024-07-25 01:03:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.37% +[2024-07-25 01:03:50 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 01:03:51 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 01:03:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][0/625] eta 0:09:24 lr 0.000928 wd 0.0500 time 0.9038 (0.9038) data time 0.3862 (0.3862) model time 0.0000 (0.0000) loss 8.6136 (8.6136) grad_norm 2.3196 (2.3196) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:03:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][10/625] eta 0:06:13 lr 0.000928 wd 0.0500 time 0.5638 (0.6069) data time 0.0006 (0.0358) model time 0.0000 (0.0000) loss 7.6867 (7.9115) grad_norm 1.7293 (2.3112) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:04:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][20/625] eta 0:05:59 lr 0.000927 wd 0.0500 time 0.5631 (0.5936) data time 0.0007 (0.0192) model time 0.0000 (0.0000) loss 8.6221 (7.8825) grad_norm 2.5086 (2.4412) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:04:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][30/625] eta 0:05:53 lr 0.000927 wd 0.0500 time 0.5614 (0.5936) data time 0.0006 (0.0132) model time 0.0000 (0.0000) loss 7.4241 (7.9729) grad_norm 2.5767 (2.4032) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:04:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][40/625] eta 0:05:44 lr 0.000927 wd 0.0500 time 0.5655 (0.5892) data time 0.0008 (0.0102) model time 0.0000 (0.0000) loss 7.8822 (7.8431) grad_norm 2.0612 (2.3158) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:04:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][50/625] eta 0:05:37 lr 0.000927 wd 0.0500 time 0.5625 (0.5867) data time 0.0006 (0.0084) model time 0.0000 (0.0000) loss 7.8761 (7.8641) grad_norm 1.7432 (2.2199) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:04:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][60/625] eta 0:05:30 lr 0.000927 wd 0.0500 time 0.5616 (0.5852) data time 0.0008 (0.0072) model time 0.5607 (0.5767) loss 7.2422 (7.8283) grad_norm 1.6345 (2.2056) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:04:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][70/625] eta 0:05:24 lr 0.000927 wd 0.0500 time 0.5666 (0.5843) data time 0.0008 (0.0063) model time 0.5659 (0.5770) loss 8.3348 (7.8421) grad_norm 2.0082 (2.1697) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:04:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][80/625] eta 0:05:19 lr 0.000927 wd 0.0500 time 0.7531 (0.5871) data time 0.0006 (0.0056) model time 0.7525 (0.5868) loss 5.8195 (7.8178) grad_norm 1.8273 (2.1581) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:04:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][90/625] eta 0:05:17 lr 0.000927 wd 0.0500 time 0.5615 (0.5931) data time 0.0007 (0.0051) model time 0.5608 (0.6003) loss 9.2960 (7.8009) grad_norm 2.5374 (2.1455) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:04:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][100/625] eta 0:05:13 lr 0.000927 wd 0.0500 time 0.7281 (0.5975) data time 0.0008 (0.0047) model time 0.7273 (0.6076) loss 8.3441 (7.8166) grad_norm 1.6980 (2.1096) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:04:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][110/625] eta 0:05:07 lr 0.000927 wd 0.0500 time 0.5628 (0.5977) data time 0.0008 (0.0043) model time 0.5620 (0.6061) loss 6.7593 (7.7781) grad_norm 1.7648 (2.1049) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:05:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][120/625] eta 0:05:00 lr 0.000927 wd 0.0500 time 0.5611 (0.5960) data time 0.0006 (0.0040) model time 0.5606 (0.6019) loss 7.0968 (7.7796) grad_norm 1.9409 (2.0911) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:05:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][130/625] eta 0:04:54 lr 0.000926 wd 0.0500 time 0.5621 (0.5946) data time 0.0008 (0.0038) model time 0.5612 (0.5987) loss 9.0475 (7.8310) grad_norm 1.7316 (2.0702) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:05:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][140/625] eta 0:04:47 lr 0.000926 wd 0.0500 time 0.5649 (0.5934) data time 0.0008 (0.0036) model time 0.5641 (0.5962) loss 8.2123 (7.8126) grad_norm 3.2641 (2.0665) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:05:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][150/625] eta 0:04:41 lr 0.000926 wd 0.0500 time 0.5647 (0.5923) data time 0.0006 (0.0034) model time 0.5641 (0.5942) loss 7.8788 (7.8088) grad_norm 1.7740 (2.0627) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:05:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][160/625] eta 0:04:34 lr 0.000926 wd 0.0500 time 0.5649 (0.5913) data time 0.0006 (0.0032) model time 0.5643 (0.5926) loss 7.7461 (7.8109) grad_norm 1.7481 (2.0588) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:05:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][170/625] eta 0:04:28 lr 0.000926 wd 0.0500 time 0.5630 (0.5906) data time 0.0008 (0.0031) model time 0.5621 (0.5914) loss 8.5064 (7.8268) grad_norm 1.8763 (2.0449) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:05:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][180/625] eta 0:04:22 lr 0.000926 wd 0.0500 time 0.5718 (0.5899) data time 0.0006 (0.0030) model time 0.5712 (0.5903) loss 7.8117 (7.8313) grad_norm 2.0386 (2.0533) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:05:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][190/625] eta 0:04:16 lr 0.000926 wd 0.0500 time 0.5605 (0.5893) data time 0.0006 (0.0029) model time 0.5599 (0.5893) loss 7.7720 (7.8353) grad_norm 2.1134 (2.0682) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:05:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][200/625] eta 0:04:10 lr 0.000926 wd 0.0500 time 0.5661 (0.5887) data time 0.0009 (0.0028) model time 0.5652 (0.5884) loss 8.0673 (7.8554) grad_norm 2.0351 (2.0699) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:05:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][210/625] eta 0:04:04 lr 0.000926 wd 0.0500 time 0.5652 (0.5882) data time 0.0008 (0.0027) model time 0.5644 (0.5877) loss 5.8886 (7.8279) grad_norm 2.4107 (2.0828) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:06:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][220/625] eta 0:03:58 lr 0.000926 wd 0.0500 time 0.5623 (0.5877) data time 0.0006 (0.0026) model time 0.5617 (0.5871) loss 6.3617 (7.8052) grad_norm 1.8894 (2.0903) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:06:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][230/625] eta 0:03:51 lr 0.000926 wd 0.0500 time 0.5653 (0.5873) data time 0.0008 (0.0025) model time 0.5645 (0.5866) loss 7.8019 (7.7827) grad_norm 1.8325 (2.0960) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:06:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][240/625] eta 0:03:45 lr 0.000925 wd 0.0500 time 0.5671 (0.5869) data time 0.0009 (0.0025) model time 0.5662 (0.5861) loss 6.9093 (7.7907) grad_norm 1.5348 (2.0897) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:06:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][250/625] eta 0:03:40 lr 0.000925 wd 0.0500 time 0.5691 (0.5872) data time 0.0006 (0.0024) model time 0.5685 (0.5864) loss 7.2676 (7.7722) grad_norm 1.6523 (2.0857) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:06:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][260/625] eta 0:03:34 lr 0.000925 wd 0.0500 time 0.5626 (0.5868) data time 0.0008 (0.0023) model time 0.5618 (0.5860) loss 6.2210 (7.7632) grad_norm 1.9935 (2.0924) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:06:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][270/625] eta 0:03:28 lr 0.000925 wd 0.0500 time 0.5645 (0.5865) data time 0.0006 (0.0023) model time 0.5639 (0.5856) loss 6.7213 (7.7458) grad_norm 1.4688 (2.0954) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:06:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][280/625] eta 0:03:22 lr 0.000925 wd 0.0500 time 0.5669 (0.5862) data time 0.0006 (0.0022) model time 0.5663 (0.5852) loss 6.4758 (7.7499) grad_norm 1.8196 (2.0924) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:06:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][290/625] eta 0:03:16 lr 0.000925 wd 0.0500 time 0.5699 (0.5859) data time 0.0008 (0.0022) model time 0.5691 (0.5848) loss 7.0808 (7.7525) grad_norm 2.2551 (2.0961) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:06:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][300/625] eta 0:03:10 lr 0.000925 wd 0.0500 time 0.7248 (0.5866) data time 0.0006 (0.0022) model time 0.7242 (0.5856) loss 7.2440 (7.7386) grad_norm 2.2365 (2.0966) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:06:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][310/625] eta 0:03:05 lr 0.000925 wd 0.0500 time 0.7056 (0.5897) data time 0.0008 (0.0021) model time 0.7047 (0.5893) loss 6.6027 (7.7282) grad_norm 1.7875 (2.1031) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:07:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][320/625] eta 0:03:00 lr 0.000925 wd 0.0500 time 0.7360 (0.5910) data time 0.0006 (0.0021) model time 0.7354 (0.5909) loss 8.2299 (7.7249) grad_norm 2.0839 (2.0983) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:07:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][330/625] eta 0:02:54 lr 0.000925 wd 0.0500 time 0.5651 (0.5907) data time 0.0008 (0.0021) model time 0.5643 (0.5906) loss 7.3698 (7.7090) grad_norm 1.9947 (2.0970) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:07:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][340/625] eta 0:02:48 lr 0.000925 wd 0.0500 time 0.5627 (0.5904) data time 0.0006 (0.0020) model time 0.5620 (0.5901) loss 7.0387 (7.7070) grad_norm 2.7330 (2.1008) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:07:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][350/625] eta 0:02:42 lr 0.000925 wd 0.0500 time 0.5657 (0.5902) data time 0.0006 (0.0020) model time 0.5651 (0.5899) loss 6.5508 (7.6990) grad_norm 1.8013 (2.1045) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:07:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][360/625] eta 0:02:36 lr 0.000924 wd 0.0500 time 0.5653 (0.5900) data time 0.0006 (0.0020) model time 0.5647 (0.5896) loss 6.3630 (7.7043) grad_norm 3.1144 (2.0992) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:07:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][370/625] eta 0:02:30 lr 0.000924 wd 0.0500 time 0.5697 (0.5897) data time 0.0006 (0.0019) model time 0.5691 (0.5893) loss 6.8090 (7.6971) grad_norm 1.8191 (2.0996) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:07:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][380/625] eta 0:02:24 lr 0.000924 wd 0.0500 time 0.5700 (0.5894) data time 0.0008 (0.0019) model time 0.5692 (0.5890) loss 6.9093 (7.6985) grad_norm 1.9855 (2.1058) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:07:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][390/625] eta 0:02:18 lr 0.000924 wd 0.0500 time 0.5638 (0.5892) data time 0.0008 (0.0019) model time 0.5630 (0.5887) loss 9.0191 (7.7112) grad_norm 1.7307 (2.1069) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:07:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][400/625] eta 0:02:12 lr 0.000924 wd 0.0500 time 0.5676 (0.5889) data time 0.0006 (0.0018) model time 0.5670 (0.5883) loss 6.9917 (7.7022) grad_norm 1.8669 (2.1078) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:07:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][410/625] eta 0:02:06 lr 0.000924 wd 0.0500 time 0.5658 (0.5886) data time 0.0007 (0.0018) model time 0.5651 (0.5880) loss 7.6405 (7.7023) grad_norm 2.8683 (2.1073) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:07:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][420/625] eta 0:02:00 lr 0.000924 wd 0.0500 time 0.5643 (0.5883) data time 0.0006 (0.0018) model time 0.5637 (0.5876) loss 7.1629 (7.6978) grad_norm 1.7619 (2.1083) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:08:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][430/625] eta 0:01:54 lr 0.000924 wd 0.0500 time 0.5673 (0.5880) data time 0.0007 (0.0018) model time 0.5666 (0.5873) loss 7.5256 (7.7076) grad_norm 1.9555 (2.1112) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:08:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][440/625] eta 0:01:48 lr 0.000924 wd 0.0500 time 0.5709 (0.5878) data time 0.0006 (0.0018) model time 0.5703 (0.5871) loss 9.1216 (7.7172) grad_norm 2.1181 (2.1128) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:08:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][450/625] eta 0:01:42 lr 0.000924 wd 0.0500 time 0.5689 (0.5876) data time 0.0009 (0.0017) model time 0.5681 (0.5868) loss 10.0154 (7.7232) grad_norm 2.8671 (2.1156) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:08:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][460/625] eta 0:01:36 lr 0.000924 wd 0.0500 time 0.5645 (0.5873) data time 0.0010 (0.0017) model time 0.5635 (0.5865) loss 7.2912 (7.7168) grad_norm 2.5242 (2.1215) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:08:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][470/625] eta 0:01:31 lr 0.000923 wd 0.0500 time 0.5710 (0.5874) data time 0.0008 (0.0017) model time 0.5702 (0.5866) loss 8.1613 (7.7281) grad_norm 2.3957 (2.1276) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:08:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][480/625] eta 0:01:25 lr 0.000923 wd 0.0500 time 0.5627 (0.5872) data time 0.0008 (0.0017) model time 0.5619 (0.5864) loss 7.8143 (7.7214) grad_norm 1.6446 (2.1358) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:08:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][490/625] eta 0:01:19 lr 0.000923 wd 0.0500 time 0.5630 (0.5870) data time 0.0008 (0.0017) model time 0.5622 (0.5861) loss 7.9397 (7.7129) grad_norm 3.1991 (2.1439) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:08:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][500/625] eta 0:01:13 lr 0.000923 wd 0.0500 time 0.5643 (0.5868) data time 0.0006 (0.0016) model time 0.5637 (0.5859) loss 7.8479 (7.7101) grad_norm 2.0946 (2.1407) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:08:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][510/625] eta 0:01:07 lr 0.000923 wd 0.0500 time 0.5659 (0.5866) data time 0.0009 (0.0016) model time 0.5650 (0.5857) loss 9.1077 (7.7148) grad_norm 1.4608 (2.1375) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:08:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][520/625] eta 0:01:01 lr 0.000923 wd 0.0500 time 0.7624 (0.5868) data time 0.0008 (0.0016) model time 0.7616 (0.5860) loss 9.0548 (7.7132) grad_norm 2.5199 (2.1474) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:09:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][530/625] eta 0:00:55 lr 0.000923 wd 0.0500 time 0.7044 (0.5884) data time 0.0008 (0.0016) model time 0.7036 (0.5877) loss 8.3887 (7.7110) grad_norm 1.8282 (2.1484) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:09:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][540/625] eta 0:00:50 lr 0.000923 wd 0.0500 time 0.5638 (0.5891) data time 0.0006 (0.0016) model time 0.5632 (0.5885) loss 7.3671 (7.7159) grad_norm 4.3944 (2.1521) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:09:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][550/625] eta 0:00:44 lr 0.000923 wd 0.0500 time 0.5666 (0.5896) data time 0.0006 (0.0016) model time 0.5659 (0.5890) loss 7.8727 (7.7123) grad_norm 2.3972 (2.1512) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:09:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][560/625] eta 0:00:38 lr 0.000923 wd 0.0500 time 0.5641 (0.5893) data time 0.0008 (0.0016) model time 0.5633 (0.5887) loss 8.2840 (7.7088) grad_norm 1.8398 (2.1460) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:09:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][570/625] eta 0:00:32 lr 0.000923 wd 0.0500 time 0.5635 (0.5891) data time 0.0008 (0.0015) model time 0.5627 (0.5885) loss 7.7398 (7.7029) grad_norm 2.1094 (2.1436) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:09:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][580/625] eta 0:00:26 lr 0.000922 wd 0.0500 time 0.5640 (0.5892) data time 0.0008 (0.0015) model time 0.5632 (0.5885) loss 9.8799 (7.7039) grad_norm 2.3285 (2.1416) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:09:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][590/625] eta 0:00:20 lr 0.000922 wd 0.0500 time 0.5609 (0.5892) data time 0.0008 (0.0016) model time 0.5601 (0.5885) loss 8.7296 (7.7084) grad_norm 3.3789 (2.1494) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:09:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][600/625] eta 0:00:14 lr 0.000922 wd 0.0500 time 0.5655 (0.5891) data time 0.0006 (0.0015) model time 0.5649 (0.5884) loss 8.2242 (7.7067) grad_norm 2.8958 (2.1530) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:09:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][610/625] eta 0:00:08 lr 0.000922 wd 0.0500 time 0.5625 (0.5889) data time 0.0006 (0.0015) model time 0.5619 (0.5881) loss 10.0749 (7.7028) grad_norm 1.8829 (2.1495) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:09:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [109/300][620/625] eta 0:00:02 lr 0.000922 wd 0.0500 time 0.5628 (0.5887) data time 0.0004 (0.0015) model time 0.5624 (0.5879) loss 7.9324 (7.7038) grad_norm 1.8608 (2.1525) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:09:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 109 training takes 0:06:07 +[2024-07-25 01:09:59 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 01:10:01 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 01:10:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.477 (0.477) Loss 0.5068 (0.5068) Acc@1 89.209 (89.209) Acc@5 98.682 (98.682) Mem 22339MB +[2024-07-25 01:10:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.8330 (0.6627) Acc@1 80.273 (85.272) Acc@5 95.605 (97.505) Mem 22339MB +[2024-07-25 01:10:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.9619 (0.7855) Acc@1 75.977 (81.929) Acc@5 94.287 (96.187) Mem 22339MB +[2024-07-25 01:10:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.628 Acc@5 96.163 +[2024-07-25 01:10:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.6% +[2024-07-25 01:10:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.850 (0.850) Loss 0.5103 (0.5103) Acc@1 89.453 (89.453) Acc@5 98.584 (98.584) Mem 22339MB +[2024-07-25 01:10:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.192) Loss 0.8242 (0.6454) Acc@1 80.469 (85.960) Acc@5 96.045 (97.643) Mem 22339MB +[2024-07-25 01:10:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.160) Loss 0.9380 (0.7584) Acc@1 76.123 (82.678) Acc@5 95.264 (96.470) Mem 22339MB +[2024-07-25 01:10:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.362 Acc@5 96.463 +[2024-07-25 01:10:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.4% +[2024-07-25 01:10:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][0/625] eta 0:14:37 lr 0.000922 wd 0.0500 time 1.4039 (1.4039) data time 0.7643 (0.7643) model time 0.0000 (0.0000) loss 9.6871 (9.6871) grad_norm 1.7502 (1.7502) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:10:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][10/625] eta 0:06:43 lr 0.000922 wd 0.0500 time 0.6055 (0.6554) data time 0.0007 (0.0703) model time 0.0000 (0.0000) loss 8.1832 (8.0470) grad_norm 1.9453 (2.3921) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:10:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][20/625] eta 0:06:14 lr 0.000922 wd 0.0500 time 0.5832 (0.6188) data time 0.0008 (0.0375) model time 0.0000 (0.0000) loss 8.7269 (8.1180) grad_norm 3.7382 (2.7699) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:10:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][30/625] eta 0:06:02 lr 0.000922 wd 0.0500 time 0.6731 (0.6089) data time 0.0006 (0.0256) model time 0.0000 (0.0000) loss 6.8294 (8.1103) grad_norm 1.9422 (2.6909) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:10:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][40/625] eta 0:05:50 lr 0.000922 wd 0.0500 time 0.5819 (0.6000) data time 0.0006 (0.0196) model time 0.0000 (0.0000) loss 8.9031 (7.9618) grad_norm 3.6545 (2.6849) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:10:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][50/625] eta 0:05:42 lr 0.000922 wd 0.0500 time 0.5842 (0.5952) data time 0.0006 (0.0159) model time 0.0000 (0.0000) loss 7.4790 (7.9745) grad_norm 1.5539 (2.5920) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:10:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][60/625] eta 0:05:34 lr 0.000921 wd 0.0500 time 0.5767 (0.5921) data time 0.0008 (0.0134) model time 0.5759 (0.5759) loss 7.0858 (7.8760) grad_norm 1.5976 (2.4581) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:10:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][70/625] eta 0:05:27 lr 0.000921 wd 0.0500 time 0.5807 (0.5901) data time 0.0008 (0.0117) model time 0.5799 (0.5763) loss 8.0287 (7.8038) grad_norm 1.7319 (2.3711) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:10:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][80/625] eta 0:05:20 lr 0.000921 wd 0.0500 time 0.5794 (0.5885) data time 0.0007 (0.0103) model time 0.5787 (0.5763) loss 6.6693 (7.8269) grad_norm 1.8461 (2.3349) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:11:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][90/625] eta 0:05:14 lr 0.000921 wd 0.0500 time 0.5793 (0.5872) data time 0.0009 (0.0093) model time 0.5784 (0.5762) loss 8.5330 (7.8560) grad_norm 2.1379 (2.3259) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:11:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][100/625] eta 0:05:07 lr 0.000921 wd 0.0500 time 0.5918 (0.5864) data time 0.0006 (0.0085) model time 0.5912 (0.5767) loss 9.3844 (7.8600) grad_norm 2.1093 (2.3050) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:11:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][110/625] eta 0:05:01 lr 0.000921 wd 0.0500 time 0.5921 (0.5858) data time 0.0006 (0.0078) model time 0.5915 (0.5770) loss 7.6513 (7.8554) grad_norm 1.5487 (2.3020) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:11:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][120/625] eta 0:04:58 lr 0.000921 wd 0.0500 time 0.7444 (0.5905) data time 0.0007 (0.0072) model time 0.7437 (0.5863) loss 6.8207 (7.8679) grad_norm 2.6083 (2.3490) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:11:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][130/625] eta 0:04:53 lr 0.000921 wd 0.0500 time 0.7394 (0.5937) data time 0.0006 (0.0067) model time 0.7389 (0.5920) loss 7.0838 (7.8646) grad_norm 1.9802 (2.3687) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:11:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][140/625] eta 0:04:48 lr 0.000921 wd 0.0500 time 0.5777 (0.5955) data time 0.0008 (0.0063) model time 0.5769 (0.5949) loss 8.5076 (7.8702) grad_norm 2.2302 (2.3456) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:11:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][150/625] eta 0:04:42 lr 0.000921 wd 0.0500 time 0.5881 (0.5954) data time 0.0007 (0.0059) model time 0.5873 (0.5947) loss 8.0321 (7.8890) grad_norm 4.1387 (2.3524) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:11:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][160/625] eta 0:04:36 lr 0.000921 wd 0.0500 time 0.5801 (0.5943) data time 0.0007 (0.0056) model time 0.5793 (0.5931) loss 8.6129 (7.8973) grad_norm 1.7085 (2.3824) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:11:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][170/625] eta 0:04:30 lr 0.000920 wd 0.0500 time 0.5870 (0.5934) data time 0.0009 (0.0054) model time 0.5862 (0.5918) loss 9.2528 (7.9074) grad_norm 1.5976 (2.3596) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:11:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][180/625] eta 0:04:23 lr 0.000920 wd 0.0500 time 0.5782 (0.5927) data time 0.0006 (0.0051) model time 0.5776 (0.5909) loss 7.8716 (7.9376) grad_norm 1.8126 (2.3416) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:12:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][190/625] eta 0:04:17 lr 0.000920 wd 0.0500 time 0.5871 (0.5919) data time 0.0007 (0.0049) model time 0.5864 (0.5899) loss 7.4276 (7.9284) grad_norm 2.3184 (2.3141) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:12:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][200/625] eta 0:04:11 lr 0.000920 wd 0.0500 time 0.5809 (0.5912) data time 0.0007 (0.0047) model time 0.5802 (0.5890) loss 8.5016 (7.9184) grad_norm 1.5481 (2.2998) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:12:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][210/625] eta 0:04:05 lr 0.000920 wd 0.0500 time 0.5774 (0.5904) data time 0.0008 (0.0045) model time 0.5766 (0.5880) loss 7.9725 (7.9184) grad_norm 2.9243 (2.2902) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:12:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][220/625] eta 0:03:58 lr 0.000920 wd 0.0500 time 0.5814 (0.5898) data time 0.0006 (0.0043) model time 0.5808 (0.5873) loss 7.9782 (7.9311) grad_norm 1.6697 (2.2748) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:12:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][230/625] eta 0:03:52 lr 0.000920 wd 0.0500 time 0.5811 (0.5892) data time 0.0006 (0.0042) model time 0.5804 (0.5867) loss 6.6218 (7.9297) grad_norm 2.4860 (inf) loss_scale 2048.0000 (4069.4026) mem 22339MB +[2024-07-25 01:12:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][240/625] eta 0:03:46 lr 0.000920 wd 0.0500 time 0.5829 (0.5889) data time 0.0006 (0.0040) model time 0.5823 (0.5863) loss 8.7591 (7.9232) grad_norm 1.8692 (inf) loss_scale 2048.0000 (3985.5270) mem 22339MB +[2024-07-25 01:12:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][250/625] eta 0:03:40 lr 0.000920 wd 0.0500 time 0.5852 (0.5884) data time 0.0006 (0.0039) model time 0.5846 (0.5859) loss 6.8531 (7.9019) grad_norm 2.3391 (inf) loss_scale 2048.0000 (3908.3347) mem 22339MB +[2024-07-25 01:12:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][260/625] eta 0:03:34 lr 0.000920 wd 0.0500 time 0.6105 (0.5883) data time 0.0006 (0.0038) model time 0.6098 (0.5858) loss 7.2735 (7.8987) grad_norm 1.9782 (inf) loss_scale 2048.0000 (3837.0575) mem 22339MB +[2024-07-25 01:12:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][270/625] eta 0:03:28 lr 0.000920 wd 0.0500 time 0.6102 (0.5883) data time 0.0006 (0.0037) model time 0.6096 (0.5858) loss 9.1205 (7.9131) grad_norm 2.3806 (inf) loss_scale 2048.0000 (3771.0406) mem 22339MB +[2024-07-25 01:12:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][280/625] eta 0:03:22 lr 0.000919 wd 0.0500 time 0.5842 (0.5880) data time 0.0007 (0.0036) model time 0.5835 (0.5855) loss 7.6371 (7.8960) grad_norm 2.1521 (inf) loss_scale 2048.0000 (3709.7224) mem 22339MB +[2024-07-25 01:12:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][290/625] eta 0:03:16 lr 0.000919 wd 0.0500 time 0.5805 (0.5876) data time 0.0008 (0.0035) model time 0.5798 (0.5851) loss 8.7427 (7.8851) grad_norm 2.3284 (inf) loss_scale 2048.0000 (3652.6186) mem 22339MB +[2024-07-25 01:13:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][300/625] eta 0:03:10 lr 0.000919 wd 0.0500 time 0.5880 (0.5873) data time 0.0008 (0.0034) model time 0.5872 (0.5848) loss 8.6654 (7.8752) grad_norm 2.7241 (inf) loss_scale 2048.0000 (3599.3090) mem 22339MB +[2024-07-25 01:13:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][310/625] eta 0:03:04 lr 0.000919 wd 0.0500 time 0.5916 (0.5871) data time 0.0008 (0.0034) model time 0.5908 (0.5846) loss 8.1572 (7.8776) grad_norm 1.6915 (inf) loss_scale 2048.0000 (3549.4277) mem 22339MB +[2024-07-25 01:13:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][320/625] eta 0:02:58 lr 0.000919 wd 0.0500 time 0.5825 (0.5868) data time 0.0006 (0.0033) model time 0.5820 (0.5843) loss 9.0149 (7.8877) grad_norm 1.8746 (inf) loss_scale 2048.0000 (3502.6542) mem 22339MB +[2024-07-25 01:13:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][330/625] eta 0:02:53 lr 0.000919 wd 0.0500 time 0.5839 (0.5866) data time 0.0006 (0.0032) model time 0.5834 (0.5841) loss 7.7971 (7.8856) grad_norm 2.3819 (inf) loss_scale 2048.0000 (3458.7069) mem 22339MB +[2024-07-25 01:13:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][340/625] eta 0:02:47 lr 0.000919 wd 0.0500 time 0.7120 (0.5885) data time 0.0006 (0.0031) model time 0.7114 (0.5864) loss 7.4334 (7.8854) grad_norm 1.7647 (inf) loss_scale 2048.0000 (3417.3372) mem 22339MB +[2024-07-25 01:13:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][350/625] eta 0:02:42 lr 0.000919 wd 0.0500 time 0.6789 (0.5903) data time 0.0009 (0.0031) model time 0.6781 (0.5886) loss 7.1693 (7.8937) grad_norm 1.7983 (inf) loss_scale 2048.0000 (3378.3248) mem 22339MB +[2024-07-25 01:13:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][360/625] eta 0:02:36 lr 0.000919 wd 0.0500 time 0.5820 (0.5908) data time 0.0006 (0.0030) model time 0.5814 (0.5892) loss 6.5017 (7.8798) grad_norm 1.8044 (inf) loss_scale 2048.0000 (3341.4737) mem 22339MB +[2024-07-25 01:13:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][370/625] eta 0:02:30 lr 0.000919 wd 0.0500 time 0.5809 (0.5909) data time 0.0008 (0.0030) model time 0.5801 (0.5893) loss 8.7688 (7.8887) grad_norm 1.7657 (inf) loss_scale 2048.0000 (3306.6092) mem 22339MB +[2024-07-25 01:13:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][380/625] eta 0:02:24 lr 0.000919 wd 0.0500 time 0.5776 (0.5905) data time 0.0008 (0.0029) model time 0.5768 (0.5889) loss 9.0195 (7.8870) grad_norm 2.0738 (inf) loss_scale 2048.0000 (3273.5748) mem 22339MB +[2024-07-25 01:13:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][390/625] eta 0:02:18 lr 0.000918 wd 0.0500 time 0.5808 (0.5901) data time 0.0008 (0.0029) model time 0.5800 (0.5885) loss 8.7627 (7.8865) grad_norm 2.2968 (inf) loss_scale 2048.0000 (3242.2302) mem 22339MB +[2024-07-25 01:14:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][400/625] eta 0:02:12 lr 0.000918 wd 0.0500 time 0.5855 (0.5898) data time 0.0006 (0.0028) model time 0.5849 (0.5882) loss 8.6347 (7.8869) grad_norm 1.9875 (inf) loss_scale 2048.0000 (3212.4489) mem 22339MB +[2024-07-25 01:14:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][410/625] eta 0:02:06 lr 0.000918 wd 0.0500 time 0.5834 (0.5895) data time 0.0007 (0.0028) model time 0.5827 (0.5878) loss 6.8009 (7.8920) grad_norm 4.0164 (inf) loss_scale 2048.0000 (3184.1168) mem 22339MB +[2024-07-25 01:14:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][420/625] eta 0:02:00 lr 0.000918 wd 0.0500 time 0.5790 (0.5892) data time 0.0006 (0.0027) model time 0.5783 (0.5875) loss 7.4857 (7.8973) grad_norm 2.0116 (inf) loss_scale 2048.0000 (3157.1306) mem 22339MB +[2024-07-25 01:14:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][430/625] eta 0:01:54 lr 0.000918 wd 0.0500 time 0.5814 (0.5890) data time 0.0008 (0.0027) model time 0.5806 (0.5872) loss 7.0918 (7.8837) grad_norm 1.5369 (inf) loss_scale 2048.0000 (3131.3968) mem 22339MB +[2024-07-25 01:14:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][440/625] eta 0:01:48 lr 0.000918 wd 0.0500 time 0.5794 (0.5887) data time 0.0008 (0.0026) model time 0.5787 (0.5869) loss 8.8536 (7.8763) grad_norm 2.3614 (inf) loss_scale 2048.0000 (3106.8299) mem 22339MB +[2024-07-25 01:14:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][450/625] eta 0:01:42 lr 0.000918 wd 0.0500 time 0.5933 (0.5884) data time 0.0007 (0.0026) model time 0.5926 (0.5867) loss 9.0526 (7.8710) grad_norm 1.5854 (inf) loss_scale 2048.0000 (3083.3525) mem 22339MB +[2024-07-25 01:14:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][460/625] eta 0:01:37 lr 0.000918 wd 0.0500 time 0.5788 (0.5882) data time 0.0008 (0.0025) model time 0.5781 (0.5864) loss 8.2687 (7.8724) grad_norm 1.8334 (inf) loss_scale 2048.0000 (3060.8937) mem 22339MB +[2024-07-25 01:14:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][470/625] eta 0:01:31 lr 0.000918 wd 0.0500 time 0.5844 (0.5880) data time 0.0006 (0.0025) model time 0.5838 (0.5862) loss 5.9926 (7.8703) grad_norm 1.7418 (inf) loss_scale 2048.0000 (3039.3885) mem 22339MB +[2024-07-25 01:14:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][480/625] eta 0:01:25 lr 0.000918 wd 0.0500 time 0.5798 (0.5879) data time 0.0008 (0.0025) model time 0.5790 (0.5861) loss 6.5783 (7.8649) grad_norm 1.6665 (inf) loss_scale 2048.0000 (3018.7775) mem 22339MB +[2024-07-25 01:14:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][490/625] eta 0:01:19 lr 0.000918 wd 0.0500 time 0.5826 (0.5877) data time 0.0008 (0.0024) model time 0.5818 (0.5859) loss 7.6154 (7.8557) grad_norm 2.5185 (inf) loss_scale 2048.0000 (2999.0061) mem 22339MB +[2024-07-25 01:15:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][500/625] eta 0:01:13 lr 0.000917 wd 0.0500 time 0.5805 (0.5874) data time 0.0008 (0.0024) model time 0.5797 (0.5857) loss 6.3293 (7.8494) grad_norm 1.6527 (inf) loss_scale 2048.0000 (2980.0240) mem 22339MB +[2024-07-25 01:15:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][510/625] eta 0:01:07 lr 0.000917 wd 0.0500 time 0.5852 (0.5873) data time 0.0009 (0.0024) model time 0.5844 (0.5855) loss 8.1867 (7.8537) grad_norm 1.5828 (inf) loss_scale 2048.0000 (2961.7847) mem 22339MB +[2024-07-25 01:15:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][520/625] eta 0:01:01 lr 0.000917 wd 0.0500 time 0.5824 (0.5871) data time 0.0006 (0.0024) model time 0.5817 (0.5854) loss 8.5051 (7.8562) grad_norm 1.5422 (inf) loss_scale 2048.0000 (2944.2457) mem 22339MB +[2024-07-25 01:15:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][530/625] eta 0:00:55 lr 0.000917 wd 0.0500 time 0.5819 (0.5869) data time 0.0006 (0.0023) model time 0.5813 (0.5852) loss 6.5432 (7.8604) grad_norm 2.0707 (inf) loss_scale 2048.0000 (2927.3672) mem 22339MB +[2024-07-25 01:15:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][540/625] eta 0:00:49 lr 0.000917 wd 0.0500 time 0.5805 (0.5867) data time 0.0006 (0.0023) model time 0.5799 (0.5850) loss 8.0377 (7.8626) grad_norm 2.3111 (inf) loss_scale 2048.0000 (2911.1128) mem 22339MB +[2024-07-25 01:15:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][550/625] eta 0:00:43 lr 0.000917 wd 0.0500 time 0.5799 (0.5866) data time 0.0006 (0.0023) model time 0.5793 (0.5848) loss 7.9211 (7.8641) grad_norm 2.0554 (inf) loss_scale 2048.0000 (2895.4483) mem 22339MB +[2024-07-25 01:15:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][560/625] eta 0:00:38 lr 0.000917 wd 0.0500 time 0.7496 (0.5873) data time 0.0008 (0.0022) model time 0.7488 (0.5857) loss 5.7891 (7.8575) grad_norm 1.9273 (inf) loss_scale 2048.0000 (2880.3422) mem 22339MB +[2024-07-25 01:15:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][570/625] eta 0:00:32 lr 0.000917 wd 0.0500 time 0.7211 (0.5886) data time 0.0008 (0.0022) model time 0.7203 (0.5871) loss 6.4827 (7.8496) grad_norm 1.8872 (inf) loss_scale 2048.0000 (2865.7653) mem 22339MB +[2024-07-25 01:15:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][580/625] eta 0:00:26 lr 0.000917 wd 0.0500 time 0.5878 (0.5894) data time 0.0008 (0.0022) model time 0.5870 (0.5880) loss 7.8039 (7.8417) grad_norm 2.4471 (inf) loss_scale 2048.0000 (2851.6902) mem 22339MB +[2024-07-25 01:15:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][590/625] eta 0:00:20 lr 0.000917 wd 0.0500 time 0.5844 (0.5893) data time 0.0008 (0.0022) model time 0.5836 (0.5879) loss 6.7449 (7.8464) grad_norm 2.7674 (inf) loss_scale 2048.0000 (2838.0914) mem 22339MB +[2024-07-25 01:16:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][600/625] eta 0:00:14 lr 0.000917 wd 0.0500 time 0.5786 (0.5891) data time 0.0007 (0.0022) model time 0.5779 (0.5877) loss 8.7634 (7.8444) grad_norm 2.4664 (inf) loss_scale 2048.0000 (2824.9451) mem 22339MB +[2024-07-25 01:16:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][610/625] eta 0:00:08 lr 0.000917 wd 0.0500 time 0.5868 (0.5889) data time 0.0006 (0.0021) model time 0.5862 (0.5875) loss 7.2151 (7.8450) grad_norm 2.1509 (inf) loss_scale 2048.0000 (2812.2291) mem 22339MB +[2024-07-25 01:16:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [110/300][620/625] eta 0:00:02 lr 0.000916 wd 0.0500 time 0.5837 (0.5888) data time 0.0006 (0.0021) model time 0.5831 (0.5873) loss 8.6792 (7.8385) grad_norm 3.1163 (inf) loss_scale 2048.0000 (2799.9227) mem 22339MB +[2024-07-25 01:16:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 110 training takes 0:06:07 +[2024-07-25 01:16:16 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 01:16:17 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 01:16:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.482 (0.482) Loss 0.5342 (0.5342) Acc@1 88.916 (88.916) Acc@5 98.486 (98.486) Mem 22339MB +[2024-07-25 01:16:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.8906 (0.6791) Acc@1 79.004 (85.285) Acc@5 95.117 (97.314) Mem 22339MB +[2024-07-25 01:16:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 1.0264 (0.8026) Acc@1 75.049 (81.903) Acc@5 93.994 (96.031) Mem 22339MB +[2024-07-25 01:16:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.544 Acc@5 96.023 +[2024-07-25 01:16:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.5% +[2024-07-25 01:16:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.836 (0.836) Loss 0.5093 (0.5093) Acc@1 89.502 (89.502) Acc@5 98.584 (98.584) Mem 22339MB +[2024-07-25 01:16:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.190) Loss 0.8228 (0.6444) Acc@1 80.566 (85.986) Acc@5 95.996 (97.638) Mem 22339MB +[2024-07-25 01:16:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.159) Loss 0.9370 (0.7572) Acc@1 76.270 (82.699) Acc@5 95.264 (96.466) Mem 22339MB +[2024-07-25 01:16:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.376 Acc@5 96.465 +[2024-07-25 01:16:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.4% +[2024-07-25 01:16:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.38% +[2024-07-25 01:16:25 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 01:16:26 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 01:16:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][0/625] eta 0:09:35 lr 0.000916 wd 0.0500 time 0.9204 (0.9204) data time 0.4019 (0.4019) model time 0.0000 (0.0000) loss 8.7138 (8.7138) grad_norm 1.8294 (1.8294) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:16:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][10/625] eta 0:06:23 lr 0.000916 wd 0.0500 time 0.5647 (0.6229) data time 0.0006 (0.0372) model time 0.0000 (0.0000) loss 9.2757 (7.5789) grad_norm 3.6204 (2.1990) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:16:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][20/625] eta 0:06:03 lr 0.000916 wd 0.0500 time 0.5628 (0.6010) data time 0.0006 (0.0199) model time 0.0000 (0.0000) loss 9.9827 (7.7369) grad_norm 2.2590 (2.0344) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:16:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][30/625] eta 0:05:52 lr 0.000916 wd 0.0500 time 0.5619 (0.5930) data time 0.0006 (0.0137) model time 0.0000 (0.0000) loss 8.0249 (7.8905) grad_norm 2.6206 (2.0530) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:16:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][40/625] eta 0:05:46 lr 0.000916 wd 0.0500 time 0.5645 (0.5918) data time 0.0006 (0.0106) model time 0.0000 (0.0000) loss 6.5475 (7.6853) grad_norm 1.6660 (2.0661) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:16:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][50/625] eta 0:05:38 lr 0.000916 wd 0.0500 time 0.5610 (0.5890) data time 0.0009 (0.0087) model time 0.0000 (0.0000) loss 8.1202 (7.6731) grad_norm 3.0791 (2.0538) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:17:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][60/625] eta 0:05:31 lr 0.000916 wd 0.0500 time 0.5647 (0.5870) data time 0.0008 (0.0074) model time 0.5640 (0.5763) loss 6.0769 (7.7320) grad_norm 2.0719 (2.1158) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:17:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][70/625] eta 0:05:25 lr 0.000916 wd 0.0500 time 0.5716 (0.5856) data time 0.0009 (0.0065) model time 0.5707 (0.5763) loss 6.7246 (7.6694) grad_norm 2.1984 (2.1834) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:17:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][80/625] eta 0:05:18 lr 0.000916 wd 0.0500 time 0.5599 (0.5845) data time 0.0007 (0.0058) model time 0.5593 (0.5761) loss 9.2984 (7.7201) grad_norm 3.1674 (2.2698) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:17:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][90/625] eta 0:05:12 lr 0.000916 wd 0.0500 time 0.5638 (0.5836) data time 0.0008 (0.0053) model time 0.5631 (0.5758) loss 6.8303 (7.7922) grad_norm 2.1940 (2.2855) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:17:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][100/625] eta 0:05:06 lr 0.000915 wd 0.0500 time 0.5641 (0.5831) data time 0.0008 (0.0048) model time 0.5633 (0.5761) loss 8.5241 (7.7898) grad_norm 2.3881 (2.2792) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:17:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][110/625] eta 0:05:00 lr 0.000915 wd 0.0500 time 0.5681 (0.5826) data time 0.0006 (0.0045) model time 0.5674 (0.5764) loss 8.2562 (7.7875) grad_norm 4.7636 (2.3087) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:17:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][120/625] eta 0:04:54 lr 0.000915 wd 0.0500 time 0.5648 (0.5822) data time 0.0008 (0.0042) model time 0.5640 (0.5764) loss 8.9282 (7.7444) grad_norm 1.6054 (2.3006) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:17:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][130/625] eta 0:04:47 lr 0.000915 wd 0.0500 time 0.5659 (0.5818) data time 0.0008 (0.0039) model time 0.5651 (0.5764) loss 9.9419 (7.7424) grad_norm 2.6108 (2.2733) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:17:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][140/625] eta 0:04:42 lr 0.000915 wd 0.0500 time 0.5649 (0.5815) data time 0.0009 (0.0037) model time 0.5640 (0.5764) loss 8.0214 (7.7629) grad_norm 1.9993 (2.2544) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:17:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][150/625] eta 0:04:36 lr 0.000915 wd 0.0500 time 0.5659 (0.5813) data time 0.0006 (0.0035) model time 0.5653 (0.5765) loss 8.8349 (7.7482) grad_norm 2.0454 (2.2464) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:18:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][160/625] eta 0:04:32 lr 0.000915 wd 0.0500 time 0.7578 (0.5860) data time 0.0008 (0.0033) model time 0.7570 (0.5839) loss 8.0655 (7.7548) grad_norm 2.2877 (2.2356) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:18:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][170/625] eta 0:04:28 lr 0.000915 wd 0.0500 time 0.5684 (0.5893) data time 0.0008 (0.0032) model time 0.5676 (0.5885) loss 8.6578 (7.7807) grad_norm 2.5098 (2.2399) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:18:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][180/625] eta 0:04:22 lr 0.000915 wd 0.0500 time 0.5679 (0.5908) data time 0.0006 (0.0031) model time 0.5673 (0.5907) loss 7.2007 (7.7497) grad_norm 2.2330 (2.2285) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:18:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][190/625] eta 0:04:16 lr 0.000915 wd 0.0500 time 0.5700 (0.5903) data time 0.0008 (0.0029) model time 0.5692 (0.5899) loss 6.9850 (7.7338) grad_norm 2.4325 (2.2391) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:18:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][200/625] eta 0:04:10 lr 0.000915 wd 0.0500 time 0.5633 (0.5896) data time 0.0008 (0.0028) model time 0.5625 (0.5890) loss 7.3528 (7.7319) grad_norm 1.8421 (2.2393) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:18:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][210/625] eta 0:04:04 lr 0.000914 wd 0.0500 time 0.5663 (0.5891) data time 0.0006 (0.0028) model time 0.5656 (0.5883) loss 5.7779 (7.7160) grad_norm 1.8578 (2.2305) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:18:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][220/625] eta 0:03:58 lr 0.000914 wd 0.0500 time 0.5648 (0.5886) data time 0.0006 (0.0027) model time 0.5642 (0.5876) loss 7.3471 (7.7225) grad_norm 2.5663 (2.2269) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:18:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][230/625] eta 0:03:52 lr 0.000914 wd 0.0500 time 0.5624 (0.5888) data time 0.0008 (0.0026) model time 0.5617 (0.5879) loss 8.3697 (7.7306) grad_norm 2.2247 (2.2404) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:18:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][240/625] eta 0:03:46 lr 0.000914 wd 0.0500 time 0.5622 (0.5883) data time 0.0006 (0.0025) model time 0.5616 (0.5872) loss 7.1567 (7.7362) grad_norm 1.7474 (2.2289) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:18:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][250/625] eta 0:03:40 lr 0.000914 wd 0.0500 time 0.5645 (0.5878) data time 0.0007 (0.0025) model time 0.5638 (0.5867) loss 7.6705 (7.7617) grad_norm 1.7662 (2.2140) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:18:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][260/625] eta 0:03:34 lr 0.000914 wd 0.0500 time 0.5646 (0.5875) data time 0.0006 (0.0024) model time 0.5640 (0.5863) loss 6.9228 (7.7744) grad_norm 3.3272 (2.2127) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:19:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][270/625] eta 0:03:28 lr 0.000914 wd 0.0500 time 0.5717 (0.5872) data time 0.0008 (0.0023) model time 0.5709 (0.5859) loss 6.5181 (7.7636) grad_norm 1.8234 (2.2220) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:19:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][280/625] eta 0:03:22 lr 0.000914 wd 0.0500 time 0.5664 (0.5868) data time 0.0008 (0.0023) model time 0.5656 (0.5855) loss 7.4367 (7.7421) grad_norm 2.1745 (2.2144) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:19:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][290/625] eta 0:03:16 lr 0.000914 wd 0.0500 time 0.5800 (0.5865) data time 0.0008 (0.0022) model time 0.5793 (0.5852) loss 7.1127 (7.7410) grad_norm 1.9680 (2.2059) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:19:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][300/625] eta 0:03:10 lr 0.000914 wd 0.0500 time 0.5665 (0.5862) data time 0.0008 (0.0022) model time 0.5657 (0.5848) loss 8.1233 (7.7414) grad_norm 3.3953 (2.2047) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:19:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][310/625] eta 0:03:04 lr 0.000914 wd 0.0500 time 0.5683 (0.5860) data time 0.0008 (0.0021) model time 0.5675 (0.5845) loss 9.3716 (7.7646) grad_norm 1.6303 (2.1937) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:19:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][320/625] eta 0:02:58 lr 0.000913 wd 0.0500 time 0.5706 (0.5857) data time 0.0008 (0.0021) model time 0.5698 (0.5842) loss 7.7815 (7.7767) grad_norm 3.2236 (2.2026) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:19:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][330/625] eta 0:02:52 lr 0.000913 wd 0.0500 time 0.5657 (0.5854) data time 0.0008 (0.0021) model time 0.5649 (0.5839) loss 7.2807 (7.7809) grad_norm 2.1678 (2.2183) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:19:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][340/625] eta 0:02:46 lr 0.000913 wd 0.0500 time 0.5647 (0.5852) data time 0.0008 (0.0020) model time 0.5639 (0.5837) loss 7.9381 (7.7835) grad_norm 1.8388 (2.2144) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:19:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][350/625] eta 0:02:40 lr 0.000913 wd 0.0500 time 0.5644 (0.5850) data time 0.0006 (0.0020) model time 0.5638 (0.5835) loss 8.6514 (7.7753) grad_norm 1.7117 (2.2075) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:19:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][360/625] eta 0:02:34 lr 0.000913 wd 0.0500 time 0.5653 (0.5849) data time 0.0007 (0.0020) model time 0.5647 (0.5833) loss 6.9630 (7.7697) grad_norm 2.1999 (2.2116) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:20:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][370/625] eta 0:02:29 lr 0.000913 wd 0.0500 time 0.7359 (0.5851) data time 0.0007 (0.0019) model time 0.7352 (0.5836) loss 8.0164 (7.7812) grad_norm 3.3622 (2.2238) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:20:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][380/625] eta 0:02:23 lr 0.000913 wd 0.0500 time 0.7356 (0.5874) data time 0.0006 (0.0019) model time 0.7350 (0.5863) loss 6.8241 (7.7833) grad_norm 1.7449 (2.2131) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:20:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][390/625] eta 0:02:18 lr 0.000913 wd 0.0500 time 0.5637 (0.5885) data time 0.0008 (0.0019) model time 0.5629 (0.5875) loss 6.3246 (7.7700) grad_norm 1.6672 (2.2064) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:20:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][400/625] eta 0:02:12 lr 0.000913 wd 0.0500 time 0.5632 (0.5893) data time 0.0006 (0.0019) model time 0.5626 (0.5884) loss 6.2241 (7.7530) grad_norm 2.2802 (2.2044) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:20:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][410/625] eta 0:02:06 lr 0.000913 wd 0.0500 time 0.5648 (0.5890) data time 0.0007 (0.0019) model time 0.5641 (0.5880) loss 6.7992 (7.7609) grad_norm 2.1069 (2.1974) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:20:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][420/625] eta 0:02:00 lr 0.000913 wd 0.0500 time 0.5617 (0.5887) data time 0.0006 (0.0019) model time 0.5610 (0.5877) loss 6.6959 (7.7549) grad_norm 2.2306 (2.1938) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:20:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][430/625] eta 0:01:54 lr 0.000912 wd 0.0500 time 0.5695 (0.5885) data time 0.0008 (0.0019) model time 0.5687 (0.5875) loss 9.2273 (7.7663) grad_norm 2.5709 (2.1958) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:20:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][440/625] eta 0:01:48 lr 0.000912 wd 0.0500 time 0.5667 (0.5884) data time 0.0008 (0.0018) model time 0.5660 (0.5874) loss 7.2459 (7.7616) grad_norm 2.4397 (2.1909) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:20:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][450/625] eta 0:01:42 lr 0.000912 wd 0.0500 time 0.5682 (0.5885) data time 0.0006 (0.0018) model time 0.5676 (0.5875) loss 9.9749 (7.7641) grad_norm 3.5154 (2.2034) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:20:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][460/625] eta 0:01:37 lr 0.000912 wd 0.0500 time 0.5626 (0.5883) data time 0.0008 (0.0018) model time 0.5618 (0.5873) loss 7.5450 (7.7612) grad_norm 1.9640 (2.2025) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:21:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][470/625] eta 0:01:31 lr 0.000912 wd 0.0500 time 0.5696 (0.5881) data time 0.0011 (0.0018) model time 0.5685 (0.5871) loss 6.3977 (7.7439) grad_norm 2.8711 (2.2008) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:21:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][480/625] eta 0:01:25 lr 0.000912 wd 0.0500 time 0.5661 (0.5879) data time 0.0008 (0.0018) model time 0.5652 (0.5869) loss 8.3630 (7.7437) grad_norm 1.6726 (2.1973) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:21:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][490/625] eta 0:01:19 lr 0.000912 wd 0.0500 time 0.5621 (0.5878) data time 0.0007 (0.0018) model time 0.5614 (0.5867) loss 8.9637 (7.7429) grad_norm 3.0343 (2.2007) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:21:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][500/625] eta 0:01:13 lr 0.000912 wd 0.0500 time 0.5715 (0.5876) data time 0.0008 (0.0017) model time 0.5707 (0.5865) loss 8.8245 (7.7447) grad_norm 2.5816 (2.2002) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:21:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][510/625] eta 0:01:07 lr 0.000912 wd 0.0500 time 0.5623 (0.5874) data time 0.0008 (0.0017) model time 0.5616 (0.5863) loss 8.2314 (7.7424) grad_norm 1.6667 (2.1951) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:21:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][520/625] eta 0:01:01 lr 0.000912 wd 0.0500 time 0.5671 (0.5873) data time 0.0008 (0.0017) model time 0.5663 (0.5861) loss 9.4357 (7.7316) grad_norm 2.2251 (2.1886) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:21:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][530/625] eta 0:00:55 lr 0.000912 wd 0.0500 time 0.5689 (0.5871) data time 0.0007 (0.0017) model time 0.5682 (0.5860) loss 6.4925 (7.7179) grad_norm 2.0170 (2.1887) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:21:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][540/625] eta 0:00:49 lr 0.000911 wd 0.0500 time 0.5639 (0.5869) data time 0.0008 (0.0017) model time 0.5631 (0.5858) loss 8.7701 (7.7240) grad_norm 1.9366 (2.1916) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:21:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][550/625] eta 0:00:44 lr 0.000911 wd 0.0500 time 0.5720 (0.5868) data time 0.0008 (0.0017) model time 0.5712 (0.5856) loss 8.5098 (7.7351) grad_norm 2.6450 (2.1915) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:21:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][560/625] eta 0:00:38 lr 0.000911 wd 0.0500 time 0.5656 (0.5866) data time 0.0007 (0.0016) model time 0.5649 (0.5855) loss 7.0303 (7.7424) grad_norm 1.9980 (2.1888) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:22:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][570/625] eta 0:00:32 lr 0.000911 wd 0.0500 time 0.5699 (0.5865) data time 0.0008 (0.0016) model time 0.5691 (0.5853) loss 9.1578 (7.7406) grad_norm 2.8937 (2.1952) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:22:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][580/625] eta 0:00:26 lr 0.000911 wd 0.0500 time 0.5769 (0.5864) data time 0.0008 (0.0016) model time 0.5761 (0.5852) loss 7.2292 (7.7329) grad_norm 2.1746 (2.1932) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:22:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][590/625] eta 0:00:20 lr 0.000911 wd 0.0500 time 0.7463 (0.5865) data time 0.0008 (0.0016) model time 0.7456 (0.5853) loss 7.5803 (7.7337) grad_norm 1.5996 (2.1963) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:22:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][600/625] eta 0:00:14 lr 0.000911 wd 0.0500 time 0.7393 (0.5878) data time 0.0007 (0.0016) model time 0.7385 (0.5867) loss 8.5343 (7.7387) grad_norm 1.5488 (2.1996) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:22:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][610/625] eta 0:00:08 lr 0.000911 wd 0.0500 time 0.6800 (0.5886) data time 0.0004 (0.0016) model time 0.6796 (0.5877) loss 7.0047 (7.7432) grad_norm 3.2013 (2.1989) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:22:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [111/300][620/625] eta 0:00:02 lr 0.000911 wd 0.0500 time 0.5627 (0.5890) data time 0.0004 (0.0016) model time 0.5623 (0.5881) loss 8.1927 (7.7490) grad_norm 1.9270 (2.2049) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:22:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 111 training takes 0:06:08 +[2024-07-25 01:22:34 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 01:22:36 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 01:22:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.481 (0.481) Loss 0.5366 (0.5366) Acc@1 88.818 (88.818) Acc@5 98.682 (98.682) Mem 22339MB +[2024-07-25 01:22:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.8418 (0.6759) Acc@1 79.932 (85.427) Acc@5 95.752 (97.421) Mem 22339MB +[2024-07-25 01:22:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9951 (0.8014) Acc@1 75.684 (81.887) Acc@5 94.287 (96.101) Mem 22339MB +[2024-07-25 01:22:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.622 Acc@5 96.073 +[2024-07-25 01:22:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.6% +[2024-07-25 01:22:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.815 (0.815) Loss 0.5073 (0.5073) Acc@1 89.404 (89.404) Acc@5 98.633 (98.633) Mem 22339MB +[2024-07-25 01:22:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.188) Loss 0.8208 (0.6428) Acc@1 80.566 (85.964) Acc@5 95.850 (97.625) Mem 22339MB +[2024-07-25 01:22:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.158) Loss 0.9360 (0.7555) Acc@1 76.221 (82.710) Acc@5 95.215 (96.461) Mem 22339MB +[2024-07-25 01:22:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.382 Acc@5 96.465 +[2024-07-25 01:22:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.4% +[2024-07-25 01:22:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.38% +[2024-07-25 01:22:43 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 01:22:45 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 01:22:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][0/625] eta 0:09:18 lr 0.000911 wd 0.0500 time 0.8936 (0.8936) data time 0.3763 (0.3763) model time 0.0000 (0.0000) loss 8.0988 (8.0988) grad_norm 2.2767 (2.2767) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:22:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][10/625] eta 0:06:12 lr 0.000911 wd 0.0500 time 0.5642 (0.6051) data time 0.0008 (0.0350) model time 0.0000 (0.0000) loss 8.6484 (7.8954) grad_norm 1.9565 (2.2695) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:22:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][20/625] eta 0:05:58 lr 0.000910 wd 0.0500 time 0.5663 (0.5923) data time 0.0006 (0.0187) model time 0.0000 (0.0000) loss 8.1735 (7.9011) grad_norm 1.8464 (2.2040) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:23:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][30/625] eta 0:05:49 lr 0.000910 wd 0.0500 time 0.5628 (0.5874) data time 0.0007 (0.0130) model time 0.0000 (0.0000) loss 8.1053 (8.0743) grad_norm 3.1269 (2.2843) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:23:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][40/625] eta 0:05:42 lr 0.000910 wd 0.0500 time 0.5690 (0.5853) data time 0.0008 (0.0101) model time 0.0000 (0.0000) loss 7.2791 (7.9140) grad_norm 1.8319 (2.1782) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:23:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][50/625] eta 0:05:35 lr 0.000910 wd 0.0500 time 0.5680 (0.5840) data time 0.0008 (0.0083) model time 0.0000 (0.0000) loss 7.4811 (7.8120) grad_norm 2.1257 (2.1630) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:23:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][60/625] eta 0:05:29 lr 0.000910 wd 0.0500 time 0.5642 (0.5829) data time 0.0008 (0.0071) model time 0.5634 (0.5762) loss 8.7536 (7.7422) grad_norm 1.5213 (2.1560) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:23:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][70/625] eta 0:05:23 lr 0.000910 wd 0.0500 time 0.5641 (0.5821) data time 0.0008 (0.0062) model time 0.5633 (0.5764) loss 6.0683 (7.7020) grad_norm 1.6404 (2.1246) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:23:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][80/625] eta 0:05:16 lr 0.000910 wd 0.0500 time 0.5656 (0.5815) data time 0.0007 (0.0055) model time 0.5649 (0.5763) loss 7.2510 (7.7106) grad_norm 2.5494 (2.1408) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:23:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][90/625] eta 0:05:10 lr 0.000910 wd 0.0500 time 0.5702 (0.5810) data time 0.0007 (0.0050) model time 0.5695 (0.5764) loss 8.4122 (7.7480) grad_norm 2.6304 (2.0998) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:23:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][100/625] eta 0:05:04 lr 0.000910 wd 0.0500 time 0.5621 (0.5805) data time 0.0006 (0.0046) model time 0.5615 (0.5761) loss 8.9325 (7.7192) grad_norm 2.2748 (2.1222) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:23:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][110/625] eta 0:04:58 lr 0.000910 wd 0.0500 time 0.5634 (0.5801) data time 0.0007 (0.0043) model time 0.5627 (0.5760) loss 8.5800 (7.7206) grad_norm 1.4835 (2.1107) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:23:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][120/625] eta 0:04:53 lr 0.000910 wd 0.0500 time 0.5628 (0.5804) data time 0.0007 (0.0040) model time 0.5622 (0.5769) loss 7.3560 (7.7089) grad_norm 2.1266 (2.1466) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:24:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][130/625] eta 0:04:47 lr 0.000909 wd 0.0500 time 0.5643 (0.5808) data time 0.0006 (0.0038) model time 0.5637 (0.5779) loss 6.4726 (7.6748) grad_norm 2.5521 (2.1811) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:24:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][140/625] eta 0:04:41 lr 0.000909 wd 0.0500 time 0.5657 (0.5810) data time 0.0006 (0.0036) model time 0.5651 (0.5784) loss 8.2700 (7.6755) grad_norm 1.6184 (2.1727) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:24:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][150/625] eta 0:04:35 lr 0.000909 wd 0.0500 time 0.5634 (0.5807) data time 0.0008 (0.0034) model time 0.5626 (0.5781) loss 8.6606 (7.6791) grad_norm 1.2586 (2.1549) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:24:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][160/625] eta 0:04:30 lr 0.000909 wd 0.0500 time 0.5651 (0.5807) data time 0.0006 (0.0033) model time 0.5645 (0.5782) loss 7.6297 (7.6717) grad_norm 1.9620 (2.1490) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:24:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][170/625] eta 0:04:24 lr 0.000909 wd 0.0500 time 0.5625 (0.5804) data time 0.0008 (0.0031) model time 0.5617 (0.5779) loss 7.2534 (7.6693) grad_norm 2.9638 (2.1553) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:24:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][180/625] eta 0:04:18 lr 0.000909 wd 0.0500 time 0.5618 (0.5802) data time 0.0006 (0.0030) model time 0.5612 (0.5777) loss 6.9017 (7.6699) grad_norm 2.6944 (2.1553) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:24:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][190/625] eta 0:04:13 lr 0.000909 wd 0.0500 time 0.5655 (0.5830) data time 0.0007 (0.0029) model time 0.5648 (0.5816) loss 7.2555 (7.6822) grad_norm 2.4917 (2.1607) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:24:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][200/625] eta 0:04:10 lr 0.000909 wd 0.0500 time 0.6819 (0.5884) data time 0.0006 (0.0028) model time 0.6813 (0.5889) loss 7.8899 (7.6669) grad_norm 2.8256 (2.1651) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:24:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][210/625] eta 0:04:04 lr 0.000909 wd 0.0500 time 0.5644 (0.5889) data time 0.0008 (0.0027) model time 0.5636 (0.5895) loss 8.3806 (7.6649) grad_norm 2.6197 (2.1625) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:24:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][220/625] eta 0:03:58 lr 0.000909 wd 0.0500 time 0.5610 (0.5893) data time 0.0008 (0.0026) model time 0.5602 (0.5900) loss 9.1454 (7.6662) grad_norm 2.5300 (2.1674) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:25:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][230/625] eta 0:03:52 lr 0.000909 wd 0.0500 time 0.5648 (0.5890) data time 0.0008 (0.0026) model time 0.5640 (0.5894) loss 9.2308 (7.6729) grad_norm 3.2803 (2.1663) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:25:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][240/625] eta 0:03:46 lr 0.000908 wd 0.0500 time 0.5631 (0.5894) data time 0.0008 (0.0025) model time 0.5623 (0.5898) loss 8.7974 (7.7004) grad_norm 1.9317 (2.1568) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:25:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][250/625] eta 0:03:40 lr 0.000908 wd 0.0500 time 0.5649 (0.5891) data time 0.0008 (0.0025) model time 0.5640 (0.5893) loss 6.4649 (7.7216) grad_norm 2.6419 (2.1601) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:25:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][260/625] eta 0:03:34 lr 0.000908 wd 0.0500 time 0.5631 (0.5886) data time 0.0006 (0.0024) model time 0.5625 (0.5887) loss 8.4717 (7.7400) grad_norm 1.7767 (2.1553) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:25:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][270/625] eta 0:03:28 lr 0.000908 wd 0.0500 time 0.5685 (0.5882) data time 0.0006 (0.0024) model time 0.5678 (0.5881) loss 7.4428 (7.7176) grad_norm 2.2825 (2.1561) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:25:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][280/625] eta 0:03:22 lr 0.000908 wd 0.0500 time 0.5639 (0.5878) data time 0.0006 (0.0023) model time 0.5633 (0.5876) loss 6.9275 (7.7203) grad_norm 1.4689 (2.1532) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:25:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][290/625] eta 0:03:16 lr 0.000908 wd 0.0500 time 0.5630 (0.5874) data time 0.0006 (0.0023) model time 0.5624 (0.5872) loss 6.4936 (7.7140) grad_norm 6.3236 (2.1772) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:25:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][300/625] eta 0:03:10 lr 0.000908 wd 0.0500 time 0.5651 (0.5871) data time 0.0006 (0.0022) model time 0.5645 (0.5867) loss 8.2300 (7.7212) grad_norm 2.0946 (2.1870) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:25:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][310/625] eta 0:03:04 lr 0.000908 wd 0.0500 time 0.5633 (0.5868) data time 0.0008 (0.0022) model time 0.5625 (0.5863) loss 9.0035 (7.7206) grad_norm 2.5518 (2.1877) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:25:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][320/625] eta 0:02:58 lr 0.000908 wd 0.0500 time 0.5656 (0.5865) data time 0.0007 (0.0021) model time 0.5650 (0.5861) loss 8.5767 (7.7296) grad_norm 1.4931 (2.1838) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:25:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][330/625] eta 0:02:53 lr 0.000908 wd 0.0500 time 0.5628 (0.5868) data time 0.0007 (0.0021) model time 0.5621 (0.5863) loss 8.9861 (7.7455) grad_norm 1.7892 (2.1771) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:26:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][340/625] eta 0:02:47 lr 0.000908 wd 0.0500 time 0.5665 (0.5866) data time 0.0008 (0.0021) model time 0.5657 (0.5861) loss 8.1385 (7.7522) grad_norm 1.9223 (2.1775) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:26:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][350/625] eta 0:02:41 lr 0.000907 wd 0.0500 time 0.5644 (0.5864) data time 0.0008 (0.0021) model time 0.5636 (0.5857) loss 6.2739 (7.7464) grad_norm 1.5538 (2.1677) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:26:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][360/625] eta 0:02:35 lr 0.000907 wd 0.0500 time 0.5674 (0.5861) data time 0.0007 (0.0021) model time 0.5667 (0.5854) loss 7.6082 (7.7325) grad_norm 4.2583 (2.1880) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:26:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][370/625] eta 0:02:29 lr 0.000907 wd 0.0500 time 0.5615 (0.5862) data time 0.0006 (0.0020) model time 0.5608 (0.5855) loss 7.4227 (7.7228) grad_norm 2.1058 (2.1881) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:26:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][380/625] eta 0:02:23 lr 0.000907 wd 0.0500 time 0.5680 (0.5860) data time 0.0006 (0.0020) model time 0.5674 (0.5853) loss 6.9627 (7.7276) grad_norm 1.8141 (2.1816) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:26:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][390/625] eta 0:02:17 lr 0.000907 wd 0.0500 time 0.5639 (0.5858) data time 0.0007 (0.0020) model time 0.5632 (0.5851) loss 7.7899 (7.7318) grad_norm 2.0793 (2.1794) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:26:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][400/625] eta 0:02:11 lr 0.000907 wd 0.0500 time 0.5696 (0.5856) data time 0.0006 (0.0019) model time 0.5690 (0.5848) loss 6.4112 (7.7406) grad_norm 1.9666 (2.1775) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:26:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][410/625] eta 0:02:06 lr 0.000907 wd 0.0500 time 0.5632 (0.5866) data time 0.0006 (0.0019) model time 0.5626 (0.5860) loss 6.4130 (7.7440) grad_norm 2.1322 (2.1769) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:26:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][420/625] eta 0:02:00 lr 0.000907 wd 0.0500 time 0.7152 (0.5883) data time 0.0007 (0.0019) model time 0.7145 (0.5879) loss 5.7749 (7.7365) grad_norm 2.1638 (2.1733) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:26:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][430/625] eta 0:01:54 lr 0.000907 wd 0.0500 time 0.5659 (0.5890) data time 0.0006 (0.0019) model time 0.5653 (0.5886) loss 6.6849 (7.7335) grad_norm 1.9253 (2.1688) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:27:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][440/625] eta 0:01:49 lr 0.000907 wd 0.0500 time 0.5639 (0.5892) data time 0.0006 (0.0018) model time 0.5633 (0.5889) loss 7.3393 (7.7406) grad_norm 1.7223 (2.1681) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:27:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][450/625] eta 0:01:43 lr 0.000907 wd 0.0500 time 0.5658 (0.5890) data time 0.0008 (0.0018) model time 0.5651 (0.5886) loss 8.0290 (7.7538) grad_norm 3.3691 (2.1708) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:27:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][460/625] eta 0:01:37 lr 0.000906 wd 0.0500 time 0.5633 (0.5887) data time 0.0007 (0.0018) model time 0.5626 (0.5883) loss 8.6855 (7.7388) grad_norm 1.8784 (2.1748) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:27:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][470/625] eta 0:01:31 lr 0.000906 wd 0.0500 time 0.5632 (0.5885) data time 0.0007 (0.0018) model time 0.5625 (0.5881) loss 8.4051 (7.7319) grad_norm 2.1089 (2.1712) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:27:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][480/625] eta 0:01:25 lr 0.000906 wd 0.0500 time 0.5692 (0.5883) data time 0.0008 (0.0017) model time 0.5684 (0.5878) loss 8.1321 (7.7232) grad_norm 1.9715 (2.1767) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:27:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][490/625] eta 0:01:19 lr 0.000906 wd 0.0500 time 0.5627 (0.5881) data time 0.0008 (0.0017) model time 0.5619 (0.5876) loss 7.6358 (7.7265) grad_norm 1.8488 (2.1709) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:27:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][500/625] eta 0:01:13 lr 0.000906 wd 0.0500 time 0.5659 (0.5879) data time 0.0007 (0.0017) model time 0.5652 (0.5874) loss 7.9266 (7.7282) grad_norm 2.3378 (2.1706) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:27:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][510/625] eta 0:01:07 lr 0.000906 wd 0.0500 time 0.5690 (0.5878) data time 0.0008 (0.0017) model time 0.5682 (0.5872) loss 9.2539 (7.7278) grad_norm 3.8189 (2.1760) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:27:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][520/625] eta 0:01:01 lr 0.000906 wd 0.0500 time 0.5611 (0.5877) data time 0.0008 (0.0017) model time 0.5603 (0.5871) loss 9.3954 (7.7331) grad_norm 2.9237 (2.1827) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:27:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][530/625] eta 0:00:55 lr 0.000906 wd 0.0500 time 0.5649 (0.5877) data time 0.0008 (0.0017) model time 0.5641 (0.5871) loss 6.0287 (7.7262) grad_norm 2.0762 (2.1891) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:28:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][540/625] eta 0:00:49 lr 0.000906 wd 0.0500 time 0.5687 (0.5875) data time 0.0006 (0.0017) model time 0.5681 (0.5869) loss 8.8332 (7.7353) grad_norm 1.5748 (2.1829) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:28:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][550/625] eta 0:00:44 lr 0.000906 wd 0.0500 time 0.5620 (0.5873) data time 0.0006 (0.0017) model time 0.5615 (0.5867) loss 8.4601 (7.7358) grad_norm 1.6565 (2.1763) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:28:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][560/625] eta 0:00:38 lr 0.000906 wd 0.0500 time 0.5616 (0.5872) data time 0.0006 (0.0016) model time 0.5610 (0.5865) loss 6.1292 (7.7281) grad_norm 1.4806 (2.1711) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:28:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][570/625] eta 0:00:32 lr 0.000905 wd 0.0500 time 0.5658 (0.5870) data time 0.0008 (0.0016) model time 0.5649 (0.5863) loss 9.5986 (7.7361) grad_norm 1.6913 (2.1639) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:28:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][580/625] eta 0:00:26 lr 0.000905 wd 0.0500 time 0.5695 (0.5869) data time 0.0008 (0.0016) model time 0.5688 (0.5862) loss 8.6157 (7.7335) grad_norm 2.6476 (2.1626) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:28:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][590/625] eta 0:00:20 lr 0.000905 wd 0.0500 time 0.5649 (0.5867) data time 0.0007 (0.0016) model time 0.5641 (0.5860) loss 8.3089 (7.7336) grad_norm 1.9089 (2.1648) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:28:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][600/625] eta 0:00:14 lr 0.000905 wd 0.0500 time 0.5698 (0.5866) data time 0.0008 (0.0016) model time 0.5690 (0.5859) loss 8.1156 (7.7323) grad_norm 1.9488 (2.1635) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:28:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][610/625] eta 0:00:08 lr 0.000905 wd 0.0500 time 0.5589 (0.5865) data time 0.0006 (0.0016) model time 0.5583 (0.5857) loss 8.1290 (7.7253) grad_norm 1.7011 (2.1628) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:28:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [112/300][620/625] eta 0:00:02 lr 0.000905 wd 0.0500 time 0.5582 (0.5865) data time 0.0006 (0.0016) model time 0.5576 (0.5857) loss 9.0381 (7.7313) grad_norm 1.7349 (2.1620) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:28:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 112 training takes 0:06:06 +[2024-07-25 01:28:51 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 01:28:53 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 01:28:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.471 (0.471) Loss 0.5361 (0.5361) Acc@1 88.330 (88.330) Acc@5 98.584 (98.584) Mem 22339MB +[2024-07-25 01:28:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.8818 (0.6733) Acc@1 79.492 (85.387) Acc@5 95.557 (97.470) Mem 22339MB +[2024-07-25 01:28:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9795 (0.7956) Acc@1 75.732 (82.057) Acc@5 94.482 (96.138) Mem 22339MB +[2024-07-25 01:28:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.694 Acc@5 96.119 +[2024-07-25 01:28:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.7% +[2024-07-25 01:28:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.794 (0.794) Loss 0.5063 (0.5063) Acc@1 89.307 (89.307) Acc@5 98.633 (98.633) Mem 22339MB +[2024-07-25 01:28:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.187) Loss 0.8198 (0.6419) Acc@1 80.615 (85.986) Acc@5 95.996 (97.647) Mem 22339MB +[2024-07-25 01:29:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.158) Loss 0.9341 (0.7545) Acc@1 76.221 (82.719) Acc@5 95.068 (96.477) Mem 22339MB +[2024-07-25 01:29:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.396 Acc@5 96.487 +[2024-07-25 01:29:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.4% +[2024-07-25 01:29:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.40% +[2024-07-25 01:29:00 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 01:29:02 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 01:29:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][0/625] eta 0:09:04 lr 0.000905 wd 0.0500 time 0.8718 (0.8718) data time 0.3527 (0.3527) model time 0.0000 (0.0000) loss 8.2907 (8.2907) grad_norm 1.5378 (1.5378) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:29:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][10/625] eta 0:06:44 lr 0.000905 wd 0.0500 time 0.7208 (0.6572) data time 0.0008 (0.0329) model time 0.0000 (0.0000) loss 6.8613 (7.8947) grad_norm 2.0306 (1.8314) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:29:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][20/625] eta 0:06:35 lr 0.000905 wd 0.0500 time 0.5644 (0.6529) data time 0.0006 (0.0176) model time 0.0000 (0.0000) loss 8.0984 (7.8221) grad_norm 2.3963 (2.2895) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:29:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][30/625] eta 0:06:16 lr 0.000905 wd 0.0500 time 0.5646 (0.6335) data time 0.0008 (0.0122) model time 0.0000 (0.0000) loss 8.8396 (7.9020) grad_norm 2.0597 (2.1978) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:29:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][40/625] eta 0:06:03 lr 0.000905 wd 0.0500 time 0.5629 (0.6211) data time 0.0008 (0.0095) model time 0.0000 (0.0000) loss 7.9115 (7.8583) grad_norm 2.6299 (2.2339) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:29:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][50/625] eta 0:05:53 lr 0.000904 wd 0.0500 time 0.5655 (0.6144) data time 0.0009 (0.0078) model time 0.0000 (0.0000) loss 8.6449 (7.8799) grad_norm 1.6188 (2.2128) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:29:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][60/625] eta 0:05:43 lr 0.000904 wd 0.0500 time 0.5632 (0.6082) data time 0.0008 (0.0067) model time 0.5624 (0.5754) loss 8.8079 (7.9309) grad_norm 2.0854 (2.2154) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:29:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][70/625] eta 0:05:34 lr 0.000904 wd 0.0500 time 0.5612 (0.6036) data time 0.0009 (0.0059) model time 0.5603 (0.5749) loss 8.9184 (7.9220) grad_norm 3.0936 (2.2578) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:29:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][80/625] eta 0:05:27 lr 0.000904 wd 0.0500 time 0.5707 (0.6005) data time 0.0007 (0.0052) model time 0.5700 (0.5759) loss 8.2493 (7.9072) grad_norm 2.9115 (2.2932) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:29:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][90/625] eta 0:05:19 lr 0.000904 wd 0.0500 time 0.5625 (0.5980) data time 0.0008 (0.0048) model time 0.5617 (0.5762) loss 8.0139 (7.8735) grad_norm 2.1067 (2.2703) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:30:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][100/625] eta 0:05:12 lr 0.000904 wd 0.0500 time 0.5675 (0.5961) data time 0.0008 (0.0044) model time 0.5667 (0.5765) loss 7.0010 (7.9084) grad_norm 1.7703 (2.2776) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:30:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][110/625] eta 0:05:06 lr 0.000904 wd 0.0500 time 0.5687 (0.5945) data time 0.0007 (0.0040) model time 0.5680 (0.5767) loss 7.8143 (7.8720) grad_norm 2.3481 (2.2805) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:30:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][120/625] eta 0:04:59 lr 0.000904 wd 0.0500 time 0.5648 (0.5931) data time 0.0007 (0.0038) model time 0.5641 (0.5767) loss 7.9034 (7.9001) grad_norm 2.0972 (2.2686) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:30:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][130/625] eta 0:04:52 lr 0.000904 wd 0.0500 time 0.5662 (0.5919) data time 0.0006 (0.0035) model time 0.5656 (0.5767) loss 6.9642 (7.8759) grad_norm 1.7612 (2.2410) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:30:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][140/625] eta 0:04:46 lr 0.000904 wd 0.0500 time 0.5646 (0.5909) data time 0.0009 (0.0034) model time 0.5637 (0.5767) loss 7.4170 (7.8589) grad_norm 1.5178 (2.2222) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:30:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][150/625] eta 0:04:40 lr 0.000904 wd 0.0500 time 0.5645 (0.5901) data time 0.0008 (0.0032) model time 0.5637 (0.5768) loss 8.5598 (7.8597) grad_norm 2.5391 (2.1986) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:30:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][160/625] eta 0:04:33 lr 0.000903 wd 0.0500 time 0.5675 (0.5892) data time 0.0006 (0.0031) model time 0.5668 (0.5767) loss 7.1676 (7.8393) grad_norm 1.9759 (2.1946) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:30:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][170/625] eta 0:04:27 lr 0.000903 wd 0.0500 time 0.5695 (0.5886) data time 0.0008 (0.0029) model time 0.5687 (0.5767) loss 6.9822 (7.8239) grad_norm 1.9223 (2.1933) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:30:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][180/625] eta 0:04:21 lr 0.000903 wd 0.0500 time 0.5685 (0.5880) data time 0.0006 (0.0028) model time 0.5679 (0.5767) loss 8.2138 (7.8295) grad_norm 2.0330 (2.2336) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:30:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][190/625] eta 0:04:15 lr 0.000903 wd 0.0500 time 0.5678 (0.5877) data time 0.0009 (0.0027) model time 0.5669 (0.5772) loss 5.9680 (7.8136) grad_norm 2.0515 (2.2515) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:31:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][200/625] eta 0:04:09 lr 0.000903 wd 0.0500 time 0.5681 (0.5872) data time 0.0007 (0.0026) model time 0.5674 (0.5771) loss 8.5535 (7.8047) grad_norm 3.4225 (2.2924) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:31:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][210/625] eta 0:04:03 lr 0.000903 wd 0.0500 time 0.5708 (0.5868) data time 0.0008 (0.0025) model time 0.5700 (0.5771) loss 8.7743 (7.7861) grad_norm 2.9529 (2.3202) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:31:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][220/625] eta 0:03:57 lr 0.000903 wd 0.0500 time 0.5659 (0.5863) data time 0.0006 (0.0024) model time 0.5652 (0.5770) loss 7.6800 (7.7994) grad_norm 1.9623 (2.3083) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:31:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][230/625] eta 0:03:52 lr 0.000903 wd 0.0500 time 0.6945 (0.5893) data time 0.0006 (0.0024) model time 0.6938 (0.5813) loss 7.6686 (7.8064) grad_norm 1.8937 (2.2924) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:31:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][240/625] eta 0:03:47 lr 0.000903 wd 0.0500 time 0.5654 (0.5912) data time 0.0008 (0.0023) model time 0.5646 (0.5841) loss 9.8941 (7.8150) grad_norm 1.7560 (2.2787) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:31:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][250/625] eta 0:03:41 lr 0.000903 wd 0.0500 time 0.5651 (0.5919) data time 0.0006 (0.0022) model time 0.5645 (0.5853) loss 9.5434 (7.8166) grad_norm 1.5469 (2.2657) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:31:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][260/625] eta 0:03:35 lr 0.000903 wd 0.0500 time 0.5679 (0.5916) data time 0.0007 (0.0022) model time 0.5672 (0.5852) loss 7.2253 (7.7952) grad_norm 2.1490 (2.2537) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:31:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][270/625] eta 0:03:29 lr 0.000902 wd 0.0500 time 0.5693 (0.5910) data time 0.0009 (0.0021) model time 0.5684 (0.5847) loss 6.9912 (7.7822) grad_norm 1.8435 (2.2424) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:31:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][280/625] eta 0:03:23 lr 0.000902 wd 0.0500 time 0.5616 (0.5905) data time 0.0006 (0.0021) model time 0.5610 (0.5844) loss 7.5919 (7.7744) grad_norm 2.0766 (2.2312) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:31:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][290/625] eta 0:03:17 lr 0.000902 wd 0.0500 time 0.5620 (0.5901) data time 0.0006 (0.0021) model time 0.5614 (0.5841) loss 6.5087 (7.7672) grad_norm 2.1491 (2.2250) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:31:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][300/625] eta 0:03:11 lr 0.000902 wd 0.0500 time 0.5652 (0.5898) data time 0.0008 (0.0020) model time 0.5644 (0.5839) loss 6.3130 (7.7669) grad_norm 1.5099 (2.2237) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:32:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][310/625] eta 0:03:05 lr 0.000902 wd 0.0500 time 0.5652 (0.5894) data time 0.0008 (0.0020) model time 0.5645 (0.5836) loss 7.6812 (7.7692) grad_norm 2.2643 (2.2118) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:32:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][320/625] eta 0:02:59 lr 0.000902 wd 0.0500 time 0.5659 (0.5891) data time 0.0008 (0.0019) model time 0.5651 (0.5834) loss 7.2897 (7.7527) grad_norm 1.7250 (2.2033) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:32:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][330/625] eta 0:02:53 lr 0.000902 wd 0.0500 time 0.5639 (0.5887) data time 0.0006 (0.0019) model time 0.5633 (0.5832) loss 8.3070 (7.7407) grad_norm 2.7413 (2.2113) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:32:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][340/625] eta 0:02:47 lr 0.000902 wd 0.0500 time 0.5639 (0.5884) data time 0.0008 (0.0019) model time 0.5631 (0.5830) loss 7.5551 (7.7343) grad_norm 1.8843 (2.2116) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:32:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][350/625] eta 0:02:41 lr 0.000902 wd 0.0500 time 0.5675 (0.5881) data time 0.0008 (0.0019) model time 0.5668 (0.5827) loss 7.5439 (7.7214) grad_norm 1.5826 (2.2058) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:32:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][360/625] eta 0:02:35 lr 0.000902 wd 0.0500 time 0.5707 (0.5878) data time 0.0006 (0.0018) model time 0.5700 (0.5825) loss 6.7715 (7.7176) grad_norm 1.9514 (2.2000) loss_scale 4096.0000 (2093.3850) mem 22339MB +[2024-07-25 01:32:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][370/625] eta 0:02:29 lr 0.000902 wd 0.0500 time 0.5709 (0.5874) data time 0.0008 (0.0018) model time 0.5700 (0.5823) loss 7.4103 (7.7269) grad_norm 1.9790 (2.1893) loss_scale 4096.0000 (2147.3639) mem 22339MB +[2024-07-25 01:32:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][380/625] eta 0:02:23 lr 0.000901 wd 0.0500 time 0.5620 (0.5872) data time 0.0006 (0.0018) model time 0.5614 (0.5822) loss 8.1333 (7.7262) grad_norm 2.8638 (2.2014) loss_scale 4096.0000 (2198.5092) mem 22339MB +[2024-07-25 01:32:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][390/625] eta 0:02:17 lr 0.000901 wd 0.0500 time 0.5674 (0.5870) data time 0.0006 (0.0018) model time 0.5668 (0.5820) loss 6.1990 (7.7204) grad_norm 1.9600 (2.2058) loss_scale 4096.0000 (2247.0384) mem 22339MB +[2024-07-25 01:32:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][400/625] eta 0:02:12 lr 0.000901 wd 0.0500 time 0.5685 (0.5868) data time 0.0006 (0.0017) model time 0.5679 (0.5819) loss 7.1347 (7.7090) grad_norm 1.8908 (2.2008) loss_scale 4096.0000 (2293.1471) mem 22339MB +[2024-07-25 01:33:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][410/625] eta 0:02:06 lr 0.000901 wd 0.0500 time 0.5663 (0.5869) data time 0.0009 (0.0017) model time 0.5654 (0.5822) loss 8.5069 (7.7068) grad_norm 1.5766 (2.1966) loss_scale 4096.0000 (2337.0122) mem 22339MB +[2024-07-25 01:33:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][420/625] eta 0:02:00 lr 0.000901 wd 0.0500 time 0.5623 (0.5867) data time 0.0008 (0.0017) model time 0.5614 (0.5820) loss 8.6196 (7.7209) grad_norm 2.4520 (2.1997) loss_scale 4096.0000 (2378.7933) mem 22339MB +[2024-07-25 01:33:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][430/625] eta 0:01:54 lr 0.000901 wd 0.0500 time 0.5629 (0.5865) data time 0.0006 (0.0017) model time 0.5622 (0.5819) loss 6.9518 (7.7209) grad_norm 2.3373 (2.2080) loss_scale 4096.0000 (2418.6357) mem 22339MB +[2024-07-25 01:33:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][440/625] eta 0:01:48 lr 0.000901 wd 0.0500 time 0.5621 (0.5863) data time 0.0006 (0.0017) model time 0.5615 (0.5818) loss 7.1482 (7.7187) grad_norm 2.3834 (2.2136) loss_scale 4096.0000 (2456.6712) mem 22339MB +[2024-07-25 01:33:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][450/625] eta 0:01:42 lr 0.000901 wd 0.0500 time 0.7499 (0.5873) data time 0.0006 (0.0016) model time 0.7494 (0.5830) loss 8.3454 (7.7102) grad_norm 2.1914 (2.2143) loss_scale 4096.0000 (2493.0200) mem 22339MB +[2024-07-25 01:33:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][460/625] eta 0:01:37 lr 0.000901 wd 0.0500 time 0.7583 (0.5898) data time 0.0009 (0.0016) model time 0.7574 (0.5859) loss 8.4976 (7.7126) grad_norm 1.9418 (2.2082) loss_scale 4096.0000 (2527.7918) mem 22339MB +[2024-07-25 01:33:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][470/625] eta 0:01:31 lr 0.000901 wd 0.0500 time 0.5709 (0.5900) data time 0.0009 (0.0016) model time 0.5700 (0.5862) loss 7.9799 (7.7179) grad_norm 1.6374 (2.2086) loss_scale 4096.0000 (2561.0870) mem 22339MB +[2024-07-25 01:33:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][480/625] eta 0:01:25 lr 0.000900 wd 0.0500 time 0.5630 (0.5900) data time 0.0007 (0.0016) model time 0.5622 (0.5863) loss 6.6085 (7.7123) grad_norm 1.8799 (2.2032) loss_scale 4096.0000 (2592.9979) mem 22339MB +[2024-07-25 01:33:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][490/625] eta 0:01:19 lr 0.000900 wd 0.0500 time 0.5628 (0.5898) data time 0.0006 (0.0016) model time 0.5622 (0.5861) loss 6.5562 (7.7092) grad_norm 2.6858 (2.2052) loss_scale 4096.0000 (2623.6090) mem 22339MB +[2024-07-25 01:33:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][500/625] eta 0:01:13 lr 0.000900 wd 0.0500 time 0.5636 (0.5896) data time 0.0006 (0.0016) model time 0.5629 (0.5859) loss 8.4989 (7.7119) grad_norm 2.0801 (2.2017) loss_scale 4096.0000 (2652.9980) mem 22339MB +[2024-07-25 01:34:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][510/625] eta 0:01:07 lr 0.000900 wd 0.0500 time 0.5609 (0.5893) data time 0.0008 (0.0016) model time 0.5601 (0.5857) loss 9.0048 (7.7101) grad_norm 2.3726 (2.1997) loss_scale 4096.0000 (2681.2368) mem 22339MB +[2024-07-25 01:34:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][520/625] eta 0:01:01 lr 0.000900 wd 0.0500 time 0.5649 (0.5891) data time 0.0006 (0.0015) model time 0.5643 (0.5855) loss 6.4624 (7.7068) grad_norm 3.5510 (2.2063) loss_scale 4096.0000 (2708.3916) mem 22339MB +[2024-07-25 01:34:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][530/625] eta 0:00:55 lr 0.000900 wd 0.0500 time 0.5684 (0.5890) data time 0.0006 (0.0015) model time 0.5677 (0.5854) loss 8.5581 (7.7020) grad_norm 1.4925 (2.2074) loss_scale 4096.0000 (2734.5235) mem 22339MB +[2024-07-25 01:34:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][540/625] eta 0:00:50 lr 0.000900 wd 0.0500 time 0.5640 (0.5888) data time 0.0009 (0.0015) model time 0.5631 (0.5853) loss 8.5704 (7.7040) grad_norm 3.1544 (2.2122) loss_scale 4096.0000 (2759.6895) mem 22339MB +[2024-07-25 01:34:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][550/625] eta 0:00:44 lr 0.000900 wd 0.0500 time 0.5681 (0.5886) data time 0.0009 (0.0015) model time 0.5673 (0.5851) loss 6.7526 (7.7032) grad_norm 2.0727 (2.2117) loss_scale 4096.0000 (2783.9419) mem 22339MB +[2024-07-25 01:34:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][560/625] eta 0:00:38 lr 0.000900 wd 0.0500 time 0.5657 (0.5884) data time 0.0006 (0.0015) model time 0.5651 (0.5850) loss 8.3609 (7.7016) grad_norm 2.3065 (2.2213) loss_scale 4096.0000 (2807.3298) mem 22339MB +[2024-07-25 01:34:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][570/625] eta 0:00:32 lr 0.000900 wd 0.0500 time 0.5646 (0.5882) data time 0.0008 (0.0015) model time 0.5638 (0.5848) loss 8.0567 (7.6999) grad_norm 2.6551 (2.2198) loss_scale 4096.0000 (2829.8984) mem 22339MB +[2024-07-25 01:34:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][580/625] eta 0:00:26 lr 0.000900 wd 0.0500 time 0.5661 (0.5881) data time 0.0006 (0.0015) model time 0.5656 (0.5847) loss 8.3251 (7.7047) grad_norm 2.4831 (2.2230) loss_scale 4096.0000 (2851.6902) mem 22339MB +[2024-07-25 01:34:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][590/625] eta 0:00:20 lr 0.000899 wd 0.0500 time 0.5699 (0.5879) data time 0.0009 (0.0015) model time 0.5690 (0.5846) loss 7.6779 (7.7023) grad_norm 1.9032 (2.2217) loss_scale 4096.0000 (2872.7445) mem 22339MB +[2024-07-25 01:34:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][600/625] eta 0:00:14 lr 0.000899 wd 0.0500 time 0.5641 (0.5878) data time 0.0007 (0.0014) model time 0.5634 (0.5845) loss 9.3999 (7.6981) grad_norm 1.9059 (2.2159) loss_scale 4096.0000 (2893.0982) mem 22339MB +[2024-07-25 01:35:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][610/625] eta 0:00:08 lr 0.000899 wd 0.0500 time 0.5634 (0.5878) data time 0.0004 (0.0014) model time 0.5630 (0.5845) loss 8.4653 (7.7000) grad_norm 1.8292 (2.2141) loss_scale 4096.0000 (2912.7856) mem 22339MB +[2024-07-25 01:35:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [113/300][620/625] eta 0:00:02 lr 0.000899 wd 0.0500 time 0.7605 (0.5879) data time 0.0004 (0.0014) model time 0.7601 (0.5847) loss 6.2895 (7.6965) grad_norm 1.7155 (2.2084) loss_scale 4096.0000 (2931.8390) mem 22339MB +[2024-07-25 01:35:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 113 training takes 0:06:07 +[2024-07-25 01:35:09 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 01:35:11 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 01:35:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.470 (0.470) Loss 0.5332 (0.5332) Acc@1 88.916 (88.916) Acc@5 98.633 (98.633) Mem 22339MB +[2024-07-25 01:35:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.8828 (0.6830) Acc@1 78.857 (85.170) Acc@5 95.947 (97.434) Mem 22339MB +[2024-07-25 01:35:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.9492 (0.8017) Acc@1 76.904 (81.931) Acc@5 95.020 (96.136) Mem 22339MB +[2024-07-25 01:35:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.588 Acc@5 96.135 +[2024-07-25 01:35:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.6% +[2024-07-25 01:35:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.808 (0.808) Loss 0.5059 (0.5059) Acc@1 89.307 (89.307) Acc@5 98.633 (98.633) Mem 22339MB +[2024-07-25 01:35:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.188) Loss 0.8184 (0.6412) Acc@1 80.518 (86.004) Acc@5 96.094 (97.652) Mem 22339MB +[2024-07-25 01:35:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.158) Loss 0.9331 (0.7536) Acc@1 76.318 (82.757) Acc@5 95.215 (96.508) Mem 22339MB +[2024-07-25 01:35:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.436 Acc@5 96.509 +[2024-07-25 01:35:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.4% +[2024-07-25 01:35:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.44% +[2024-07-25 01:35:18 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 01:35:19 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 01:35:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][0/625] eta 0:09:11 lr 0.000899 wd 0.0500 time 0.8816 (0.8816) data time 0.3637 (0.3637) model time 0.0000 (0.0000) loss 8.4715 (8.4715) grad_norm 1.6869 (1.6869) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:35:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][10/625] eta 0:06:11 lr 0.000899 wd 0.0500 time 0.5662 (0.6045) data time 0.0007 (0.0337) model time 0.0000 (0.0000) loss 8.2128 (8.0202) grad_norm 1.7245 (2.0451) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:35:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][20/625] eta 0:05:57 lr 0.000899 wd 0.0500 time 0.5636 (0.5912) data time 0.0008 (0.0180) model time 0.0000 (0.0000) loss 8.6191 (7.8160) grad_norm 3.1314 (2.3546) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:35:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][30/625] eta 0:05:49 lr 0.000899 wd 0.0500 time 0.5649 (0.5870) data time 0.0008 (0.0125) model time 0.0000 (0.0000) loss 9.5415 (7.7157) grad_norm 1.8443 (2.3879) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:35:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][40/625] eta 0:05:43 lr 0.000899 wd 0.0500 time 0.6011 (0.5876) data time 0.0006 (0.0096) model time 0.0000 (0.0000) loss 7.5581 (7.6450) grad_norm 1.7319 (2.2659) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:35:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][50/625] eta 0:05:44 lr 0.000899 wd 0.0500 time 0.7589 (0.5988) data time 0.0008 (0.0079) model time 0.0000 (0.0000) loss 6.3290 (7.6464) grad_norm 1.8689 (2.2007) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:35:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][60/625] eta 0:05:41 lr 0.000899 wd 0.0500 time 0.5677 (0.6053) data time 0.0008 (0.0067) model time 0.5669 (0.6376) loss 5.9895 (7.6601) grad_norm 1.8163 (2.1506) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:36:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][70/625] eta 0:05:37 lr 0.000898 wd 0.0500 time 0.5655 (0.6080) data time 0.0006 (0.0059) model time 0.5649 (0.6306) loss 7.2312 (7.6231) grad_norm 1.6065 (2.0961) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:36:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][80/625] eta 0:05:29 lr 0.000898 wd 0.0500 time 0.5652 (0.6041) data time 0.0006 (0.0053) model time 0.5646 (0.6123) loss 7.0674 (7.6601) grad_norm 1.9705 (2.0788) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:36:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][90/625] eta 0:05:21 lr 0.000898 wd 0.0500 time 0.5613 (0.6011) data time 0.0006 (0.0048) model time 0.5606 (0.6032) loss 8.0880 (7.6763) grad_norm 2.0617 (2.0713) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:36:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][100/625] eta 0:05:14 lr 0.000898 wd 0.0500 time 0.5710 (0.5987) data time 0.0008 (0.0044) model time 0.5703 (0.5978) loss 8.2118 (7.6842) grad_norm 1.4622 (2.0395) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:36:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][110/625] eta 0:05:07 lr 0.000898 wd 0.0500 time 0.5617 (0.5974) data time 0.0007 (0.0041) model time 0.5610 (0.5954) loss 7.4297 (7.6711) grad_norm 1.9463 (2.0628) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:36:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][120/625] eta 0:05:01 lr 0.000898 wd 0.0500 time 0.5643 (0.5971) data time 0.0007 (0.0038) model time 0.5635 (0.5949) loss 5.5975 (7.6694) grad_norm 2.3604 (2.1097) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:36:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][130/625] eta 0:04:54 lr 0.000898 wd 0.0500 time 0.5660 (0.5957) data time 0.0008 (0.0036) model time 0.5653 (0.5928) loss 8.3009 (7.6628) grad_norm 2.0139 (2.1171) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:36:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][140/625] eta 0:04:48 lr 0.000898 wd 0.0500 time 0.5639 (0.5945) data time 0.0008 (0.0034) model time 0.5631 (0.5911) loss 6.3220 (7.6574) grad_norm 2.4700 (2.1267) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:36:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][150/625] eta 0:04:41 lr 0.000898 wd 0.0500 time 0.5652 (0.5936) data time 0.0007 (0.0033) model time 0.5645 (0.5901) loss 8.1827 (7.6831) grad_norm 2.2870 (2.1420) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:36:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][160/625] eta 0:04:36 lr 0.000898 wd 0.0500 time 0.5602 (0.5939) data time 0.0008 (0.0031) model time 0.5594 (0.5907) loss 7.8929 (7.6746) grad_norm 3.1527 (2.1617) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:37:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][170/625] eta 0:04:29 lr 0.000898 wd 0.0500 time 0.5659 (0.5930) data time 0.0008 (0.0030) model time 0.5651 (0.5896) loss 7.4250 (7.6817) grad_norm 1.8793 (2.1937) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:37:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][180/625] eta 0:04:23 lr 0.000897 wd 0.0500 time 0.5649 (0.5923) data time 0.0008 (0.0029) model time 0.5641 (0.5889) loss 8.2277 (7.7148) grad_norm 1.7835 (2.1873) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:37:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][190/625] eta 0:04:17 lr 0.000897 wd 0.0500 time 0.5653 (0.5916) data time 0.0010 (0.0028) model time 0.5644 (0.5880) loss 8.5983 (7.7134) grad_norm 2.4073 (2.1864) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:37:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][200/625] eta 0:04:11 lr 0.000897 wd 0.0500 time 0.5686 (0.5909) data time 0.0009 (0.0027) model time 0.5678 (0.5874) loss 6.5229 (7.7001) grad_norm 3.1775 (2.1765) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:37:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][210/625] eta 0:04:04 lr 0.000897 wd 0.0500 time 0.5613 (0.5903) data time 0.0008 (0.0026) model time 0.5605 (0.5867) loss 9.1384 (7.7027) grad_norm 1.7127 (2.1830) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:37:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][220/625] eta 0:03:58 lr 0.000897 wd 0.0500 time 0.5627 (0.5897) data time 0.0008 (0.0025) model time 0.5619 (0.5861) loss 7.5913 (7.7180) grad_norm 2.5188 (2.1891) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:37:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][230/625] eta 0:03:52 lr 0.000897 wd 0.0500 time 0.5656 (0.5892) data time 0.0009 (0.0025) model time 0.5648 (0.5855) loss 6.9910 (7.7120) grad_norm 2.1513 (2.2064) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:37:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][240/625] eta 0:03:46 lr 0.000897 wd 0.0500 time 0.5691 (0.5886) data time 0.0006 (0.0024) model time 0.5685 (0.5850) loss 6.8255 (7.7077) grad_norm 1.7596 (2.1961) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:37:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][250/625] eta 0:03:40 lr 0.000897 wd 0.0500 time 0.5649 (0.5882) data time 0.0009 (0.0023) model time 0.5640 (0.5846) loss 7.6845 (7.7187) grad_norm 1.7776 (2.1823) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:37:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][260/625] eta 0:03:34 lr 0.000897 wd 0.0500 time 0.5657 (0.5883) data time 0.0008 (0.0023) model time 0.5650 (0.5849) loss 7.9801 (7.7200) grad_norm 1.6915 (2.1731) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:37:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][270/625] eta 0:03:29 lr 0.000897 wd 0.0500 time 0.7097 (0.5906) data time 0.0008 (0.0022) model time 0.7089 (0.5878) loss 6.5466 (7.7187) grad_norm 1.6539 (2.1685) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:38:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][280/625] eta 0:03:24 lr 0.000897 wd 0.0500 time 0.5628 (0.5919) data time 0.0006 (0.0022) model time 0.5622 (0.5895) loss 6.7522 (7.7312) grad_norm 2.6415 (2.1770) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:38:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][290/625] eta 0:03:18 lr 0.000896 wd 0.0500 time 0.5666 (0.5926) data time 0.0008 (0.0021) model time 0.5658 (0.5903) loss 8.1936 (7.7432) grad_norm 2.6627 (2.1741) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:38:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][300/625] eta 0:03:12 lr 0.000896 wd 0.0500 time 0.5660 (0.5920) data time 0.0008 (0.0021) model time 0.5653 (0.5898) loss 6.8062 (7.7381) grad_norm 2.3532 (2.1721) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:38:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][310/625] eta 0:03:06 lr 0.000896 wd 0.0500 time 0.5637 (0.5916) data time 0.0008 (0.0020) model time 0.5629 (0.5893) loss 8.8610 (7.7345) grad_norm 1.9466 (2.1737) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:38:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][320/625] eta 0:03:00 lr 0.000896 wd 0.0500 time 0.5682 (0.5912) data time 0.0008 (0.0020) model time 0.5675 (0.5888) loss 7.3869 (7.7434) grad_norm 3.6641 (2.1850) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:38:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][330/625] eta 0:02:54 lr 0.000896 wd 0.0500 time 0.5678 (0.5908) data time 0.0006 (0.0020) model time 0.5672 (0.5884) loss 6.0713 (7.7280) grad_norm 1.9327 (2.1908) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:38:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][340/625] eta 0:02:48 lr 0.000896 wd 0.0500 time 0.5645 (0.5904) data time 0.0008 (0.0019) model time 0.5638 (0.5880) loss 7.9897 (7.7278) grad_norm 2.7081 (2.1891) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:38:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][350/625] eta 0:02:42 lr 0.000896 wd 0.0500 time 0.5634 (0.5900) data time 0.0006 (0.0019) model time 0.5628 (0.5876) loss 7.3186 (7.7223) grad_norm 1.6605 (2.1825) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:38:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][360/625] eta 0:02:36 lr 0.000896 wd 0.0500 time 0.5662 (0.5897) data time 0.0008 (0.0019) model time 0.5654 (0.5873) loss 8.4869 (7.7305) grad_norm 1.9636 (2.1725) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:38:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][370/625] eta 0:02:30 lr 0.000896 wd 0.0500 time 0.5691 (0.5894) data time 0.0006 (0.0018) model time 0.5685 (0.5870) loss 8.1705 (7.7370) grad_norm 2.1756 (2.1769) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:39:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][380/625] eta 0:02:24 lr 0.000896 wd 0.0500 time 0.5679 (0.5892) data time 0.0007 (0.0018) model time 0.5673 (0.5868) loss 7.1887 (7.7333) grad_norm 1.5847 (2.1710) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:39:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][390/625] eta 0:02:18 lr 0.000896 wd 0.0500 time 0.5719 (0.5890) data time 0.0006 (0.0018) model time 0.5713 (0.5866) loss 5.0878 (7.7268) grad_norm 3.4504 (2.1721) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:39:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][400/625] eta 0:02:12 lr 0.000895 wd 0.0500 time 0.5665 (0.5887) data time 0.0006 (0.0018) model time 0.5660 (0.5863) loss 6.4854 (7.7145) grad_norm 3.9454 (2.1838) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:39:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][410/625] eta 0:02:06 lr 0.000895 wd 0.0500 time 0.5648 (0.5884) data time 0.0007 (0.0017) model time 0.5641 (0.5861) loss 6.6405 (7.7108) grad_norm 1.6822 (2.1810) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:39:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][420/625] eta 0:02:00 lr 0.000895 wd 0.0500 time 0.5673 (0.5881) data time 0.0009 (0.0017) model time 0.5665 (0.5858) loss 8.9171 (7.7188) grad_norm 2.1052 (2.1768) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:39:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][430/625] eta 0:01:54 lr 0.000895 wd 0.0500 time 0.5734 (0.5879) data time 0.0008 (0.0017) model time 0.5727 (0.5856) loss 6.9471 (7.7121) grad_norm 2.0642 (2.1719) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:39:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][440/625] eta 0:01:48 lr 0.000895 wd 0.0500 time 0.5636 (0.5877) data time 0.0008 (0.0017) model time 0.5628 (0.5854) loss 7.4414 (7.7133) grad_norm 2.7549 (2.1752) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:39:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][450/625] eta 0:01:42 lr 0.000895 wd 0.0500 time 0.5649 (0.5874) data time 0.0008 (0.0017) model time 0.5641 (0.5851) loss 9.2410 (7.7143) grad_norm 2.5642 (2.1783) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:39:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][460/625] eta 0:01:36 lr 0.000895 wd 0.0500 time 0.5690 (0.5872) data time 0.0008 (0.0017) model time 0.5682 (0.5849) loss 6.0772 (7.7097) grad_norm 2.8417 (2.1834) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:39:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][470/625] eta 0:01:30 lr 0.000895 wd 0.0500 time 0.5671 (0.5871) data time 0.0006 (0.0016) model time 0.5665 (0.5848) loss 6.7542 (7.7027) grad_norm 1.7121 (2.1991) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 01:40:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][480/625] eta 0:01:25 lr 0.000895 wd 0.0500 time 0.5655 (0.5871) data time 0.0008 (0.0016) model time 0.5648 (0.5849) loss 7.8543 (7.7027) grad_norm 2.8573 (inf) loss_scale 2048.0000 (4066.1954) mem 22339MB +[2024-07-25 01:40:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][490/625] eta 0:01:19 lr 0.000895 wd 0.0500 time 0.7306 (0.5886) data time 0.0007 (0.0016) model time 0.7299 (0.5866) loss 7.8609 (7.7139) grad_norm 2.1783 (inf) loss_scale 2048.0000 (4025.0916) mem 22339MB +[2024-07-25 01:40:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][500/625] eta 0:01:13 lr 0.000894 wd 0.0500 time 0.5637 (0.5893) data time 0.0008 (0.0016) model time 0.5629 (0.5873) loss 5.8533 (7.7122) grad_norm 2.2571 (inf) loss_scale 2048.0000 (3985.6287) mem 22339MB +[2024-07-25 01:40:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][510/625] eta 0:01:07 lr 0.000894 wd 0.0500 time 0.5681 (0.5894) data time 0.0008 (0.0016) model time 0.5673 (0.5875) loss 7.4854 (7.7159) grad_norm 1.8658 (inf) loss_scale 2048.0000 (3947.7104) mem 22339MB +[2024-07-25 01:40:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][520/625] eta 0:01:01 lr 0.000894 wd 0.0500 time 0.5642 (0.5892) data time 0.0008 (0.0016) model time 0.5634 (0.5873) loss 8.4287 (7.7259) grad_norm 2.1335 (inf) loss_scale 2048.0000 (3911.2476) mem 22339MB +[2024-07-25 01:40:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][530/625] eta 0:00:55 lr 0.000894 wd 0.0500 time 0.5664 (0.5890) data time 0.0007 (0.0016) model time 0.5657 (0.5870) loss 5.4389 (7.7213) grad_norm 1.9704 (inf) loss_scale 2048.0000 (3876.1582) mem 22339MB +[2024-07-25 01:40:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][540/625] eta 0:00:50 lr 0.000894 wd 0.0500 time 0.5644 (0.5888) data time 0.0008 (0.0015) model time 0.5636 (0.5868) loss 9.1059 (7.7228) grad_norm 1.6305 (inf) loss_scale 2048.0000 (3842.3660) mem 22339MB +[2024-07-25 01:40:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][550/625] eta 0:00:44 lr 0.000894 wd 0.0500 time 0.5627 (0.5885) data time 0.0006 (0.0015) model time 0.5621 (0.5866) loss 7.3610 (7.7266) grad_norm 2.7803 (inf) loss_scale 2048.0000 (3809.8004) mem 22339MB +[2024-07-25 01:40:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][560/625] eta 0:00:38 lr 0.000894 wd 0.0500 time 0.5674 (0.5884) data time 0.0008 (0.0015) model time 0.5666 (0.5864) loss 6.4790 (7.7240) grad_norm 2.1849 (inf) loss_scale 2048.0000 (3778.3957) mem 22339MB +[2024-07-25 01:40:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][570/625] eta 0:00:32 lr 0.000894 wd 0.0500 time 0.5641 (0.5882) data time 0.0007 (0.0015) model time 0.5634 (0.5863) loss 7.8699 (7.7300) grad_norm 2.5722 (inf) loss_scale 2048.0000 (3748.0911) mem 22339MB +[2024-07-25 01:41:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][580/625] eta 0:00:26 lr 0.000894 wd 0.0500 time 0.5655 (0.5880) data time 0.0006 (0.0015) model time 0.5649 (0.5861) loss 6.1088 (7.7286) grad_norm 2.9468 (inf) loss_scale 2048.0000 (3718.8296) mem 22339MB +[2024-07-25 01:41:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][590/625] eta 0:00:20 lr 0.000894 wd 0.0500 time 0.5669 (0.5878) data time 0.0008 (0.0015) model time 0.5661 (0.5859) loss 6.2414 (7.7287) grad_norm 2.5909 (inf) loss_scale 2048.0000 (3690.5584) mem 22339MB +[2024-07-25 01:41:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][600/625] eta 0:00:14 lr 0.000894 wd 0.0500 time 0.5634 (0.5878) data time 0.0008 (0.0015) model time 0.5626 (0.5859) loss 7.3303 (7.7239) grad_norm 1.8707 (inf) loss_scale 2048.0000 (3663.2280) mem 22339MB +[2024-07-25 01:41:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][610/625] eta 0:00:08 lr 0.000893 wd 0.0500 time 0.5701 (0.5877) data time 0.0004 (0.0015) model time 0.5697 (0.5858) loss 6.6894 (7.7206) grad_norm 1.7121 (inf) loss_scale 2048.0000 (3636.7921) mem 22339MB +[2024-07-25 01:41:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [114/300][620/625] eta 0:00:02 lr 0.000893 wd 0.0500 time 0.5643 (0.5875) data time 0.0004 (0.0015) model time 0.5639 (0.5856) loss 8.9477 (7.7213) grad_norm 2.6342 (inf) loss_scale 2048.0000 (3611.2077) mem 22339MB +[2024-07-25 01:41:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 114 training takes 0:06:07 +[2024-07-25 01:41:26 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 01:41:28 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 01:41:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.472 (0.472) Loss 0.5337 (0.5337) Acc@1 88.086 (88.086) Acc@5 98.389 (98.389) Mem 22339MB +[2024-07-25 01:41:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.8677 (0.6681) Acc@1 79.443 (85.267) Acc@5 95.654 (97.448) Mem 22339MB +[2024-07-25 01:41:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.9692 (0.7897) Acc@1 75.830 (81.978) Acc@5 94.678 (96.126) Mem 22339MB +[2024-07-25 01:41:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.636 Acc@5 96.091 +[2024-07-25 01:41:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.6% +[2024-07-25 01:41:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.885 (0.885) Loss 0.5049 (0.5049) Acc@1 89.307 (89.307) Acc@5 98.633 (98.633) Mem 22339MB +[2024-07-25 01:41:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.195) Loss 0.8174 (0.6403) Acc@1 80.566 (86.013) Acc@5 96.094 (97.652) Mem 22339MB +[2024-07-25 01:41:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.162) Loss 0.9316 (0.7525) Acc@1 76.367 (82.771) Acc@5 95.312 (96.508) Mem 22339MB +[2024-07-25 01:41:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.452 Acc@5 96.507 +[2024-07-25 01:41:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.5% +[2024-07-25 01:41:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.45% +[2024-07-25 01:41:35 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 01:41:37 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 01:41:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][0/625] eta 0:09:26 lr 0.000893 wd 0.0500 time 0.9063 (0.9063) data time 0.3823 (0.3823) model time 0.0000 (0.0000) loss 8.4219 (8.4219) grad_norm 1.9636 (1.9636) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:41:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][10/625] eta 0:06:13 lr 0.000893 wd 0.0500 time 0.5711 (0.6078) data time 0.0006 (0.0355) model time 0.0000 (0.0000) loss 7.6272 (7.3622) grad_norm 2.5090 (2.1581) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:41:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][20/625] eta 0:05:59 lr 0.000893 wd 0.0500 time 0.5619 (0.5939) data time 0.0006 (0.0190) model time 0.0000 (0.0000) loss 6.5080 (7.3222) grad_norm 2.4005 (2.1155) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:41:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][30/625] eta 0:05:50 lr 0.000893 wd 0.0500 time 0.5657 (0.5885) data time 0.0008 (0.0131) model time 0.0000 (0.0000) loss 7.8689 (7.4436) grad_norm 2.7724 (2.1157) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:42:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][40/625] eta 0:05:42 lr 0.000893 wd 0.0500 time 0.5621 (0.5856) data time 0.0008 (0.0101) model time 0.0000 (0.0000) loss 8.5485 (7.4516) grad_norm 2.2585 (2.1535) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:42:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][50/625] eta 0:05:35 lr 0.000893 wd 0.0500 time 0.5672 (0.5838) data time 0.0008 (0.0083) model time 0.0000 (0.0000) loss 7.6810 (7.5172) grad_norm 1.8128 (2.1331) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:42:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][60/625] eta 0:05:29 lr 0.000893 wd 0.0500 time 0.5629 (0.5828) data time 0.0007 (0.0071) model time 0.5621 (0.5770) loss 7.7289 (7.4507) grad_norm 1.8841 (2.1493) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:42:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][70/625] eta 0:05:24 lr 0.000893 wd 0.0500 time 0.7349 (0.5844) data time 0.0010 (0.0062) model time 0.7339 (0.5852) loss 8.2282 (7.4853) grad_norm 1.7560 (2.2122) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:42:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][80/625] eta 0:05:20 lr 0.000893 wd 0.0500 time 0.7218 (0.5875) data time 0.0007 (0.0055) model time 0.7211 (0.5931) loss 9.2717 (7.5393) grad_norm 2.5376 (2.2374) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:42:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][90/625] eta 0:05:19 lr 0.000892 wd 0.0500 time 0.6203 (0.5963) data time 0.0009 (0.0051) model time 0.6193 (0.6114) loss 9.3260 (7.5718) grad_norm 1.9496 (2.2170) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:42:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][100/625] eta 0:05:12 lr 0.000892 wd 0.0500 time 0.5660 (0.5954) data time 0.0006 (0.0047) model time 0.5654 (0.6064) loss 6.3454 (7.5473) grad_norm 3.4811 (2.2115) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:42:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][110/625] eta 0:05:07 lr 0.000892 wd 0.0500 time 0.5696 (0.5966) data time 0.0006 (0.0043) model time 0.5690 (0.6065) loss 8.6290 (7.6161) grad_norm 1.7147 (2.2098) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:42:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][120/625] eta 0:05:00 lr 0.000892 wd 0.0500 time 0.5655 (0.5955) data time 0.0006 (0.0040) model time 0.5649 (0.6032) loss 6.5440 (7.6230) grad_norm 2.7159 (2.2154) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:42:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][130/625] eta 0:04:54 lr 0.000892 wd 0.0500 time 0.5693 (0.5956) data time 0.0008 (0.0038) model time 0.5685 (0.6023) loss 8.0677 (7.6610) grad_norm 2.0057 (2.2353) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:43:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][140/625] eta 0:04:48 lr 0.000892 wd 0.0500 time 0.5684 (0.5944) data time 0.0007 (0.0036) model time 0.5678 (0.5994) loss 6.3147 (7.6319) grad_norm 2.0118 (2.2363) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:43:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][150/625] eta 0:04:41 lr 0.000892 wd 0.0500 time 0.5629 (0.5933) data time 0.0006 (0.0034) model time 0.5623 (0.5973) loss 5.7985 (7.6334) grad_norm 2.4101 (2.2454) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:43:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][160/625] eta 0:04:35 lr 0.000892 wd 0.0500 time 0.5645 (0.5924) data time 0.0006 (0.0033) model time 0.5640 (0.5955) loss 9.0360 (7.6143) grad_norm 2.9359 (2.2609) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:43:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][170/625] eta 0:04:29 lr 0.000892 wd 0.0500 time 0.5633 (0.5915) data time 0.0007 (0.0031) model time 0.5626 (0.5939) loss 7.5640 (7.6113) grad_norm 2.5781 (2.2503) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:43:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][180/625] eta 0:04:22 lr 0.000892 wd 0.0500 time 0.5663 (0.5908) data time 0.0008 (0.0030) model time 0.5655 (0.5926) loss 8.6099 (7.6374) grad_norm 2.0504 (2.2326) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:43:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][190/625] eta 0:04:16 lr 0.000892 wd 0.0500 time 0.5678 (0.5901) data time 0.0008 (0.0029) model time 0.5669 (0.5914) loss 8.1677 (7.6454) grad_norm 1.8963 (2.2200) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:43:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][200/625] eta 0:04:10 lr 0.000891 wd 0.0500 time 0.5652 (0.5895) data time 0.0008 (0.0028) model time 0.5644 (0.5905) loss 8.4716 (7.6577) grad_norm 1.6486 (2.2067) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:43:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][210/625] eta 0:04:04 lr 0.000891 wd 0.0500 time 0.5634 (0.5890) data time 0.0006 (0.0027) model time 0.5629 (0.5897) loss 8.5259 (7.6633) grad_norm 1.8168 (2.1970) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:43:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][220/625] eta 0:03:58 lr 0.000891 wd 0.0500 time 0.5685 (0.5885) data time 0.0008 (0.0026) model time 0.5677 (0.5890) loss 8.7752 (7.6757) grad_norm 2.2775 (2.1905) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:43:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][230/625] eta 0:03:52 lr 0.000891 wd 0.0500 time 0.5707 (0.5881) data time 0.0008 (0.0026) model time 0.5699 (0.5884) loss 8.0691 (7.6635) grad_norm 1.8785 (2.1838) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:43:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][240/625] eta 0:03:46 lr 0.000891 wd 0.0500 time 0.5624 (0.5876) data time 0.0007 (0.0025) model time 0.5617 (0.5877) loss 8.2030 (7.6575) grad_norm 2.5010 (2.1719) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:44:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][250/625] eta 0:03:40 lr 0.000891 wd 0.0500 time 0.5672 (0.5872) data time 0.0008 (0.0024) model time 0.5664 (0.5871) loss 7.7695 (7.6762) grad_norm 1.8621 (2.1690) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:44:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][260/625] eta 0:03:34 lr 0.000891 wd 0.0500 time 0.5662 (0.5868) data time 0.0008 (0.0024) model time 0.5654 (0.5866) loss 8.1646 (7.6867) grad_norm 3.4269 (2.1707) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:44:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][270/625] eta 0:03:28 lr 0.000891 wd 0.0500 time 0.5647 (0.5866) data time 0.0006 (0.0023) model time 0.5641 (0.5863) loss 6.9443 (7.6836) grad_norm 1.8376 (2.1670) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:44:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][280/625] eta 0:03:22 lr 0.000891 wd 0.0500 time 0.5626 (0.5867) data time 0.0006 (0.0023) model time 0.5620 (0.5865) loss 8.2095 (7.6907) grad_norm 1.9382 (2.1700) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:44:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][290/625] eta 0:03:16 lr 0.000891 wd 0.0500 time 0.5642 (0.5869) data time 0.0009 (0.0023) model time 0.5634 (0.5866) loss 7.2289 (7.6802) grad_norm 2.6844 (2.1815) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:44:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][300/625] eta 0:03:11 lr 0.000891 wd 0.0500 time 0.7595 (0.5883) data time 0.0006 (0.0022) model time 0.7589 (0.5883) loss 6.3444 (7.6683) grad_norm 2.1453 (2.1778) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:44:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][310/625] eta 0:03:05 lr 0.000890 wd 0.0500 time 0.5600 (0.5901) data time 0.0008 (0.0022) model time 0.5592 (0.5904) loss 8.2333 (7.6633) grad_norm 1.9394 (2.1755) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:44:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][320/625] eta 0:03:00 lr 0.000890 wd 0.0500 time 0.5927 (0.5903) data time 0.0006 (0.0021) model time 0.5920 (0.5906) loss 6.0863 (7.6516) grad_norm 1.8008 (2.1787) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:44:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][330/625] eta 0:02:54 lr 0.000890 wd 0.0500 time 0.5664 (0.5907) data time 0.0008 (0.0021) model time 0.5656 (0.5911) loss 8.9670 (7.6439) grad_norm 2.2712 (2.1798) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:44:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][340/625] eta 0:02:48 lr 0.000890 wd 0.0500 time 0.5621 (0.5904) data time 0.0008 (0.0020) model time 0.5613 (0.5906) loss 7.9378 (7.6469) grad_norm 1.7108 (2.1730) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:45:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][350/625] eta 0:02:42 lr 0.000890 wd 0.0500 time 0.5705 (0.5904) data time 0.0006 (0.0020) model time 0.5698 (0.5906) loss 8.2308 (7.6513) grad_norm 2.2480 (2.1618) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:45:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][360/625] eta 0:02:36 lr 0.000890 wd 0.0500 time 0.5668 (0.5901) data time 0.0006 (0.0020) model time 0.5662 (0.5902) loss 6.5835 (7.6280) grad_norm 2.6140 (2.1773) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:45:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][370/625] eta 0:02:30 lr 0.000890 wd 0.0500 time 0.5647 (0.5898) data time 0.0006 (0.0020) model time 0.5640 (0.5898) loss 8.3548 (7.6336) grad_norm 1.9454 (2.1753) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:45:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][380/625] eta 0:02:24 lr 0.000890 wd 0.0500 time 0.5673 (0.5894) data time 0.0007 (0.0019) model time 0.5667 (0.5894) loss 6.3071 (7.6414) grad_norm 2.2834 (2.1734) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:45:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][390/625] eta 0:02:18 lr 0.000890 wd 0.0500 time 0.5678 (0.5892) data time 0.0006 (0.0019) model time 0.5672 (0.5891) loss 7.8493 (7.6363) grad_norm 2.0459 (2.1746) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:45:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][400/625] eta 0:02:12 lr 0.000890 wd 0.0500 time 0.5634 (0.5889) data time 0.0006 (0.0019) model time 0.5628 (0.5887) loss 6.4368 (7.6349) grad_norm 2.7092 (2.1916) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:45:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][410/625] eta 0:02:06 lr 0.000889 wd 0.0500 time 0.5677 (0.5887) data time 0.0006 (0.0018) model time 0.5670 (0.5885) loss 7.8463 (7.6376) grad_norm 1.7103 (2.1960) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:45:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][420/625] eta 0:02:00 lr 0.000889 wd 0.0500 time 0.5712 (0.5886) data time 0.0008 (0.0018) model time 0.5704 (0.5883) loss 8.5701 (7.6401) grad_norm 2.1358 (2.1947) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:45:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][430/625] eta 0:01:54 lr 0.000889 wd 0.0500 time 0.5716 (0.5884) data time 0.0008 (0.0018) model time 0.5708 (0.5881) loss 7.1750 (7.6341) grad_norm 2.1907 (2.1941) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:45:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][440/625] eta 0:01:48 lr 0.000889 wd 0.0500 time 0.5654 (0.5882) data time 0.0008 (0.0018) model time 0.5646 (0.5878) loss 9.2426 (7.6364) grad_norm 3.2254 (2.1943) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:46:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][450/625] eta 0:01:42 lr 0.000889 wd 0.0500 time 0.5674 (0.5881) data time 0.0008 (0.0018) model time 0.5666 (0.5878) loss 8.8453 (7.6428) grad_norm 1.8492 (2.2048) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:46:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][460/625] eta 0:01:37 lr 0.000889 wd 0.0500 time 0.5643 (0.5879) data time 0.0010 (0.0018) model time 0.5633 (0.5875) loss 8.8404 (7.6546) grad_norm 2.0385 (2.1970) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:46:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][470/625] eta 0:01:31 lr 0.000889 wd 0.0500 time 0.5626 (0.5878) data time 0.0008 (0.0017) model time 0.5618 (0.5873) loss 7.3421 (7.6534) grad_norm 2.7312 (2.1947) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:46:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][480/625] eta 0:01:25 lr 0.000889 wd 0.0500 time 0.5630 (0.5877) data time 0.0006 (0.0017) model time 0.5624 (0.5872) loss 6.3813 (7.6496) grad_norm 2.0448 (2.1917) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:46:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][490/625] eta 0:01:19 lr 0.000889 wd 0.0500 time 0.5643 (0.5877) data time 0.0009 (0.0017) model time 0.5634 (0.5872) loss 6.3032 (7.6539) grad_norm 1.5080 (2.1874) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:46:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][500/625] eta 0:01:13 lr 0.000889 wd 0.0500 time 0.5651 (0.5877) data time 0.0008 (0.0017) model time 0.5643 (0.5872) loss 6.7684 (7.6499) grad_norm 2.1075 (2.1872) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:46:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][510/625] eta 0:01:07 lr 0.000889 wd 0.0500 time 0.5644 (0.5875) data time 0.0008 (0.0017) model time 0.5636 (0.5870) loss 8.6812 (7.6581) grad_norm 1.9730 (2.1883) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:46:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][520/625] eta 0:01:01 lr 0.000888 wd 0.0500 time 0.7145 (0.5884) data time 0.0006 (0.0017) model time 0.7139 (0.5879) loss 8.8970 (7.6696) grad_norm 2.0028 (2.1961) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:46:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][530/625] eta 0:00:55 lr 0.000888 wd 0.0500 time 0.5667 (0.5894) data time 0.0008 (0.0017) model time 0.5659 (0.5890) loss 8.7398 (7.6691) grad_norm 1.5967 (2.1894) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:46:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][540/625] eta 0:00:50 lr 0.000888 wd 0.0500 time 0.7030 (0.5896) data time 0.0007 (0.0016) model time 0.7023 (0.5893) loss 8.5180 (7.6739) grad_norm 2.9394 (2.1951) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:47:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][550/625] eta 0:00:44 lr 0.000888 wd 0.0500 time 0.5636 (0.5900) data time 0.0007 (0.0016) model time 0.5629 (0.5897) loss 7.6761 (7.6648) grad_norm 2.1775 (2.1969) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:47:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][560/625] eta 0:00:38 lr 0.000888 wd 0.0500 time 0.5717 (0.5898) data time 0.0006 (0.0016) model time 0.5711 (0.5895) loss 6.9776 (7.6579) grad_norm 1.8275 (2.1936) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:47:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][570/625] eta 0:00:32 lr 0.000888 wd 0.0500 time 0.5692 (0.5897) data time 0.0006 (0.0016) model time 0.5686 (0.5894) loss 8.3096 (7.6680) grad_norm 1.9294 (2.1949) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:47:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][580/625] eta 0:00:26 lr 0.000888 wd 0.0500 time 0.5635 (0.5896) data time 0.0006 (0.0016) model time 0.5630 (0.5892) loss 7.4982 (7.6733) grad_norm 1.8805 (2.2020) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:47:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][590/625] eta 0:00:20 lr 0.000888 wd 0.0500 time 0.5640 (0.5894) data time 0.0008 (0.0016) model time 0.5632 (0.5890) loss 7.3440 (7.6678) grad_norm 5.2485 (2.2090) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:47:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][600/625] eta 0:00:14 lr 0.000888 wd 0.0500 time 0.5621 (0.5892) data time 0.0007 (0.0016) model time 0.5614 (0.5888) loss 8.2269 (7.6723) grad_norm 1.8244 (2.2091) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:47:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][610/625] eta 0:00:08 lr 0.000888 wd 0.0500 time 0.5634 (0.5891) data time 0.0004 (0.0016) model time 0.5630 (0.5886) loss 6.2031 (7.6677) grad_norm 1.7681 (2.2078) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:47:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [115/300][620/625] eta 0:00:02 lr 0.000888 wd 0.0500 time 0.5634 (0.5889) data time 0.0006 (0.0015) model time 0.5628 (0.5884) loss 6.4480 (7.6621) grad_norm 1.7028 (2.2037) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:47:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 115 training takes 0:06:08 +[2024-07-25 01:47:45 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 01:47:46 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 01:47:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.473 (0.473) Loss 0.5229 (0.5229) Acc@1 89.355 (89.355) Acc@5 98.145 (98.145) Mem 22339MB +[2024-07-25 01:47:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.157) Loss 0.8516 (0.6594) Acc@1 79.980 (85.347) Acc@5 95.752 (97.443) Mem 22339MB +[2024-07-25 01:47:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.9722 (0.7846) Acc@1 75.977 (81.920) Acc@5 94.189 (96.154) Mem 22339MB +[2024-07-25 01:47:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.604 Acc@5 96.183 +[2024-07-25 01:47:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.6% +[2024-07-25 01:47:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.803 (0.803) Loss 0.5044 (0.5044) Acc@1 89.307 (89.307) Acc@5 98.682 (98.682) Mem 22339MB +[2024-07-25 01:47:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.187) Loss 0.8164 (0.6396) Acc@1 80.469 (86.066) Acc@5 96.094 (97.656) Mem 22339MB +[2024-07-25 01:47:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.158) Loss 0.9302 (0.7515) Acc@1 76.465 (82.824) Acc@5 95.361 (96.510) Mem 22339MB +[2024-07-25 01:47:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.496 Acc@5 96.515 +[2024-07-25 01:47:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.5% +[2024-07-25 01:47:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.50% +[2024-07-25 01:47:53 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 01:47:55 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 01:47:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][0/625] eta 0:08:58 lr 0.000887 wd 0.0500 time 0.8616 (0.8616) data time 0.3441 (0.3441) model time 0.0000 (0.0000) loss 7.9791 (7.9791) grad_norm 1.7971 (1.7971) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:48:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][10/625] eta 0:06:11 lr 0.000887 wd 0.0500 time 0.5617 (0.6038) data time 0.0006 (0.0320) model time 0.0000 (0.0000) loss 8.3162 (7.6666) grad_norm 1.8360 (2.1847) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:48:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][20/625] eta 0:05:58 lr 0.000887 wd 0.0500 time 0.5691 (0.5925) data time 0.0006 (0.0172) model time 0.0000 (0.0000) loss 7.1565 (7.5135) grad_norm 1.9746 (2.0375) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:48:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][30/625] eta 0:05:49 lr 0.000887 wd 0.0500 time 0.5668 (0.5874) data time 0.0006 (0.0119) model time 0.0000 (0.0000) loss 7.5795 (7.4730) grad_norm 1.7184 (1.9934) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:48:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][40/625] eta 0:05:42 lr 0.000887 wd 0.0500 time 0.5650 (0.5847) data time 0.0008 (0.0092) model time 0.0000 (0.0000) loss 6.6452 (7.5427) grad_norm 2.7812 (2.0958) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:48:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][50/625] eta 0:05:35 lr 0.000887 wd 0.0500 time 0.5616 (0.5836) data time 0.0008 (0.0076) model time 0.0000 (0.0000) loss 8.5180 (7.5514) grad_norm 2.4223 (2.1599) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:48:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][60/625] eta 0:05:29 lr 0.000887 wd 0.0500 time 0.5675 (0.5828) data time 0.0008 (0.0065) model time 0.5667 (0.5780) loss 6.2532 (7.5402) grad_norm 2.1289 (2.1528) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 01:48:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][70/625] eta 0:05:23 lr 0.000887 wd 0.0500 time 0.5644 (0.5833) data time 0.0006 (0.0057) model time 0.5638 (0.5817) loss 8.0576 (7.6092) grad_norm 2.2582 (inf) loss_scale 1024.0000 (2004.7324) mem 22339MB +[2024-07-25 01:48:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][80/625] eta 0:05:17 lr 0.000887 wd 0.0500 time 0.5633 (0.5833) data time 0.0007 (0.0056) model time 0.5626 (0.5808) loss 8.3434 (7.6190) grad_norm 2.2746 (inf) loss_scale 1024.0000 (1883.6543) mem 22339MB +[2024-07-25 01:48:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][90/625] eta 0:05:12 lr 0.000887 wd 0.0500 time 0.5651 (0.5832) data time 0.0008 (0.0052) model time 0.5643 (0.5806) loss 7.1336 (7.6327) grad_norm 1.6557 (inf) loss_scale 1024.0000 (1789.1868) mem 22339MB +[2024-07-25 01:48:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][100/625] eta 0:05:06 lr 0.000887 wd 0.0500 time 0.5194 (0.5844) data time 0.0007 (0.0048) model time 0.5188 (0.5834) loss 8.7762 (7.6039) grad_norm 3.0082 (inf) loss_scale 1024.0000 (1713.4257) mem 22339MB +[2024-07-25 01:49:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][110/625] eta 0:05:01 lr 0.000886 wd 0.0500 time 0.5659 (0.5845) data time 0.0007 (0.0044) model time 0.5652 (0.5836) loss 8.8523 (7.6060) grad_norm 2.2485 (inf) loss_scale 1024.0000 (1651.3153) mem 22339MB +[2024-07-25 01:49:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][120/625] eta 0:04:58 lr 0.000886 wd 0.0500 time 0.6891 (0.5906) data time 0.0006 (0.0041) model time 0.6885 (0.5941) loss 6.9174 (7.5993) grad_norm 2.4458 (inf) loss_scale 1024.0000 (1599.4711) mem 22339MB +[2024-07-25 01:49:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][130/625] eta 0:04:54 lr 0.000886 wd 0.0500 time 0.6963 (0.5944) data time 0.0006 (0.0040) model time 0.6957 (0.5997) loss 6.3280 (7.6116) grad_norm 2.4040 (inf) loss_scale 1024.0000 (1555.5420) mem 22339MB +[2024-07-25 01:49:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][140/625] eta 0:04:50 lr 0.000886 wd 0.0500 time 0.7258 (0.5989) data time 0.0008 (0.0038) model time 0.7250 (0.6059) loss 8.7398 (7.6040) grad_norm 2.2767 (inf) loss_scale 1024.0000 (1517.8440) mem 22339MB +[2024-07-25 01:49:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][150/625] eta 0:04:44 lr 0.000886 wd 0.0500 time 0.5659 (0.5987) data time 0.0008 (0.0036) model time 0.5651 (0.6048) loss 7.4677 (7.5821) grad_norm 2.5660 (inf) loss_scale 1024.0000 (1485.1391) mem 22339MB +[2024-07-25 01:49:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][160/625] eta 0:04:37 lr 0.000886 wd 0.0500 time 0.5632 (0.5975) data time 0.0008 (0.0035) model time 0.5625 (0.6024) loss 8.9423 (7.5978) grad_norm 2.0823 (inf) loss_scale 1024.0000 (1456.4969) mem 22339MB +[2024-07-25 01:49:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][170/625] eta 0:04:31 lr 0.000886 wd 0.0500 time 0.5637 (0.5963) data time 0.0007 (0.0033) model time 0.5630 (0.6002) loss 7.9381 (7.5943) grad_norm 2.0975 (inf) loss_scale 1024.0000 (1431.2047) mem 22339MB +[2024-07-25 01:49:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][180/625] eta 0:04:24 lr 0.000886 wd 0.0500 time 0.5623 (0.5953) data time 0.0006 (0.0032) model time 0.5617 (0.5985) loss 7.8467 (7.5901) grad_norm 2.1041 (inf) loss_scale 1024.0000 (1408.7072) mem 22339MB +[2024-07-25 01:49:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][190/625] eta 0:04:18 lr 0.000886 wd 0.0500 time 0.5670 (0.5944) data time 0.0006 (0.0030) model time 0.5664 (0.5970) loss 8.4901 (7.6110) grad_norm 1.9870 (inf) loss_scale 1024.0000 (1388.5654) mem 22339MB +[2024-07-25 01:49:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][200/625] eta 0:04:12 lr 0.000886 wd 0.0500 time 0.5631 (0.5937) data time 0.0008 (0.0029) model time 0.5623 (0.5957) loss 9.0045 (7.6170) grad_norm 2.1978 (inf) loss_scale 1024.0000 (1370.4279) mem 22339MB +[2024-07-25 01:50:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][210/625] eta 0:04:06 lr 0.000886 wd 0.0500 time 0.5638 (0.5933) data time 0.0006 (0.0029) model time 0.5631 (0.5951) loss 8.9202 (7.6216) grad_norm 2.0909 (inf) loss_scale 1024.0000 (1354.0095) mem 22339MB +[2024-07-25 01:50:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][220/625] eta 0:04:00 lr 0.000885 wd 0.0500 time 0.5625 (0.5930) data time 0.0006 (0.0028) model time 0.5618 (0.5945) loss 8.1372 (7.6202) grad_norm 2.8451 (inf) loss_scale 1024.0000 (1339.0769) mem 22339MB +[2024-07-25 01:50:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][230/625] eta 0:03:54 lr 0.000885 wd 0.0500 time 0.5661 (0.5926) data time 0.0008 (0.0027) model time 0.5653 (0.5938) loss 8.0180 (7.6322) grad_norm 2.3164 (inf) loss_scale 1024.0000 (1325.4372) mem 22339MB +[2024-07-25 01:50:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][240/625] eta 0:03:47 lr 0.000885 wd 0.0500 time 0.5615 (0.5920) data time 0.0008 (0.0026) model time 0.5607 (0.5930) loss 9.2963 (7.6369) grad_norm 3.3515 (inf) loss_scale 1024.0000 (1312.9295) mem 22339MB +[2024-07-25 01:50:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][250/625] eta 0:03:41 lr 0.000885 wd 0.0500 time 0.5657 (0.5914) data time 0.0006 (0.0025) model time 0.5651 (0.5922) loss 8.4955 (7.6510) grad_norm 2.2184 (inf) loss_scale 1024.0000 (1301.4183) mem 22339MB +[2024-07-25 01:50:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][260/625] eta 0:03:35 lr 0.000885 wd 0.0500 time 0.5639 (0.5909) data time 0.0009 (0.0025) model time 0.5630 (0.5914) loss 7.9013 (7.6546) grad_norm 2.1143 (inf) loss_scale 1024.0000 (1290.7893) mem 22339MB +[2024-07-25 01:50:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][270/625] eta 0:03:29 lr 0.000885 wd 0.0500 time 0.5625 (0.5904) data time 0.0007 (0.0024) model time 0.5618 (0.5907) loss 8.5275 (7.6682) grad_norm 2.1749 (inf) loss_scale 1024.0000 (1280.9446) mem 22339MB +[2024-07-25 01:50:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][280/625] eta 0:03:23 lr 0.000885 wd 0.0500 time 0.5673 (0.5899) data time 0.0008 (0.0024) model time 0.5665 (0.5901) loss 7.3717 (7.6560) grad_norm 3.4386 (inf) loss_scale 1024.0000 (1271.8007) mem 22339MB +[2024-07-25 01:50:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][290/625] eta 0:03:17 lr 0.000885 wd 0.0500 time 0.5640 (0.5894) data time 0.0006 (0.0023) model time 0.5634 (0.5895) loss 9.4650 (7.6576) grad_norm 2.1170 (inf) loss_scale 1024.0000 (1263.2852) mem 22339MB +[2024-07-25 01:50:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][300/625] eta 0:03:11 lr 0.000885 wd 0.0500 time 0.5693 (0.5891) data time 0.0008 (0.0023) model time 0.5685 (0.5890) loss 9.0898 (7.6645) grad_norm 1.7501 (inf) loss_scale 1024.0000 (1255.3355) mem 22339MB +[2024-07-25 01:50:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][310/625] eta 0:03:05 lr 0.000885 wd 0.0500 time 0.5646 (0.5887) data time 0.0007 (0.0022) model time 0.5640 (0.5886) loss 8.5617 (7.6519) grad_norm 3.8880 (inf) loss_scale 1024.0000 (1247.8971) mem 22339MB +[2024-07-25 01:51:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][320/625] eta 0:02:59 lr 0.000884 wd 0.0500 time 0.5638 (0.5886) data time 0.0009 (0.0022) model time 0.5630 (0.5884) loss 7.5811 (7.6460) grad_norm 2.7177 (inf) loss_scale 1024.0000 (1240.9221) mem 22339MB +[2024-07-25 01:51:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][330/625] eta 0:02:53 lr 0.000884 wd 0.0500 time 0.7553 (0.5891) data time 0.0007 (0.0021) model time 0.7546 (0.5889) loss 9.0691 (7.6527) grad_norm 1.8362 (inf) loss_scale 1024.0000 (1234.3686) mem 22339MB +[2024-07-25 01:51:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][340/625] eta 0:02:48 lr 0.000884 wd 0.0500 time 0.7035 (0.5901) data time 0.0006 (0.0021) model time 0.7029 (0.5902) loss 6.1462 (7.6603) grad_norm 1.7844 (inf) loss_scale 1024.0000 (1228.1994) mem 22339MB +[2024-07-25 01:51:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][350/625] eta 0:02:42 lr 0.000884 wd 0.0500 time 0.7031 (0.5918) data time 0.0006 (0.0021) model time 0.7025 (0.5921) loss 7.1340 (7.6539) grad_norm 1.7430 (inf) loss_scale 1024.0000 (1222.3818) mem 22339MB +[2024-07-25 01:51:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][360/625] eta 0:02:36 lr 0.000884 wd 0.0500 time 0.6282 (0.5920) data time 0.0008 (0.0020) model time 0.6274 (0.5922) loss 6.4271 (7.6560) grad_norm 1.9768 (inf) loss_scale 1024.0000 (1216.8864) mem 22339MB +[2024-07-25 01:51:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][370/625] eta 0:02:30 lr 0.000884 wd 0.0500 time 0.5626 (0.5920) data time 0.0006 (0.0020) model time 0.5620 (0.5922) loss 7.8694 (7.6615) grad_norm 4.1237 (inf) loss_scale 1024.0000 (1211.6873) mem 22339MB +[2024-07-25 01:51:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][380/625] eta 0:02:24 lr 0.000884 wd 0.0500 time 0.5678 (0.5916) data time 0.0006 (0.0020) model time 0.5671 (0.5917) loss 6.9664 (7.6615) grad_norm 1.4791 (inf) loss_scale 1024.0000 (1206.7612) mem 22339MB +[2024-07-25 01:51:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][390/625] eta 0:02:18 lr 0.000884 wd 0.0500 time 0.5717 (0.5913) data time 0.0008 (0.0019) model time 0.5710 (0.5913) loss 8.6837 (7.6694) grad_norm 3.5864 (inf) loss_scale 1024.0000 (1202.0870) mem 22339MB +[2024-07-25 01:51:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][400/625] eta 0:02:12 lr 0.000884 wd 0.0500 time 0.5627 (0.5909) data time 0.0006 (0.0019) model time 0.5621 (0.5909) loss 6.7379 (7.6634) grad_norm 1.8950 (inf) loss_scale 1024.0000 (1197.6459) mem 22339MB +[2024-07-25 01:51:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][410/625] eta 0:02:06 lr 0.000884 wd 0.0500 time 0.5602 (0.5905) data time 0.0007 (0.0019) model time 0.5595 (0.5904) loss 7.6686 (7.6539) grad_norm 2.1431 (inf) loss_scale 1024.0000 (1193.4209) mem 22339MB +[2024-07-25 01:52:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][420/625] eta 0:02:00 lr 0.000884 wd 0.0500 time 0.5628 (0.5902) data time 0.0006 (0.0019) model time 0.5622 (0.5900) loss 7.2626 (7.6527) grad_norm 3.2661 (inf) loss_scale 1024.0000 (1189.3967) mem 22339MB +[2024-07-25 01:52:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][430/625] eta 0:01:55 lr 0.000883 wd 0.0500 time 0.5655 (0.5899) data time 0.0006 (0.0018) model time 0.5649 (0.5897) loss 7.5255 (7.6512) grad_norm 2.2625 (inf) loss_scale 1024.0000 (1185.5592) mem 22339MB +[2024-07-25 01:52:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][440/625] eta 0:01:49 lr 0.000883 wd 0.0500 time 0.5623 (0.5896) data time 0.0006 (0.0018) model time 0.5617 (0.5893) loss 6.2012 (7.6554) grad_norm 1.5912 (inf) loss_scale 1024.0000 (1181.8957) mem 22339MB +[2024-07-25 01:52:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][450/625] eta 0:01:43 lr 0.000883 wd 0.0500 time 0.5652 (0.5893) data time 0.0008 (0.0018) model time 0.5643 (0.5890) loss 8.5054 (7.6687) grad_norm 2.1023 (inf) loss_scale 1024.0000 (1178.3947) mem 22339MB +[2024-07-25 01:52:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][460/625] eta 0:01:37 lr 0.000883 wd 0.0500 time 0.5676 (0.5890) data time 0.0006 (0.0018) model time 0.5670 (0.5886) loss 8.7647 (7.6744) grad_norm 2.7512 (inf) loss_scale 1024.0000 (1175.0456) mem 22339MB +[2024-07-25 01:52:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][470/625] eta 0:01:31 lr 0.000883 wd 0.0500 time 0.5851 (0.5888) data time 0.0008 (0.0018) model time 0.5843 (0.5884) loss 7.9998 (7.6703) grad_norm 2.3321 (inf) loss_scale 1024.0000 (1171.8386) mem 22339MB +[2024-07-25 01:52:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][480/625] eta 0:01:25 lr 0.000883 wd 0.0500 time 0.5656 (0.5886) data time 0.0006 (0.0017) model time 0.5650 (0.5881) loss 7.7608 (7.6630) grad_norm 3.5718 (inf) loss_scale 1024.0000 (1168.7651) mem 22339MB +[2024-07-25 01:52:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][490/625] eta 0:01:19 lr 0.000883 wd 0.0500 time 0.5664 (0.5885) data time 0.0008 (0.0017) model time 0.5657 (0.5880) loss 7.3580 (7.6539) grad_norm 2.3284 (inf) loss_scale 1024.0000 (1165.8167) mem 22339MB +[2024-07-25 01:52:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][500/625] eta 0:01:13 lr 0.000883 wd 0.0500 time 0.5618 (0.5882) data time 0.0007 (0.0017) model time 0.5612 (0.5877) loss 7.7601 (7.6629) grad_norm 2.2598 (inf) loss_scale 1024.0000 (1162.9860) mem 22339MB +[2024-07-25 01:52:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][510/625] eta 0:01:07 lr 0.000883 wd 0.0500 time 0.5621 (0.5882) data time 0.0006 (0.0017) model time 0.5615 (0.5877) loss 8.9333 (7.6785) grad_norm 2.0128 (inf) loss_scale 1024.0000 (1160.2661) mem 22339MB +[2024-07-25 01:53:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][520/625] eta 0:01:01 lr 0.000883 wd 0.0500 time 0.5631 (0.5883) data time 0.0008 (0.0017) model time 0.5624 (0.5878) loss 8.0090 (7.6837) grad_norm 1.6890 (inf) loss_scale 1024.0000 (1157.6507) mem 22339MB +[2024-07-25 01:53:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][530/625] eta 0:00:55 lr 0.000882 wd 0.0500 time 0.5585 (0.5883) data time 0.0008 (0.0017) model time 0.5576 (0.5877) loss 8.0336 (7.6801) grad_norm 1.5252 (inf) loss_scale 1024.0000 (1155.1337) mem 22339MB +[2024-07-25 01:53:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][540/625] eta 0:00:50 lr 0.000882 wd 0.0500 time 0.5628 (0.5883) data time 0.0008 (0.0017) model time 0.5620 (0.5878) loss 7.0008 (7.6681) grad_norm 2.3766 (inf) loss_scale 1024.0000 (1152.7098) mem 22339MB +[2024-07-25 01:53:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][550/625] eta 0:00:44 lr 0.000882 wd 0.0500 time 0.7472 (0.5885) data time 0.0008 (0.0016) model time 0.7463 (0.5879) loss 6.2213 (7.6697) grad_norm 1.6462 (inf) loss_scale 1024.0000 (1150.3739) mem 22339MB +[2024-07-25 01:53:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][560/625] eta 0:00:38 lr 0.000882 wd 0.0500 time 0.7011 (0.5900) data time 0.0006 (0.0016) model time 0.7004 (0.5896) loss 7.6077 (7.6646) grad_norm 2.0756 (inf) loss_scale 1024.0000 (1148.1212) mem 22339MB +[2024-07-25 01:53:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][570/625] eta 0:00:32 lr 0.000882 wd 0.0500 time 0.7298 (0.5915) data time 0.0006 (0.0016) model time 0.7292 (0.5913) loss 7.2302 (7.6614) grad_norm 1.7745 (inf) loss_scale 1024.0000 (1145.9475) mem 22339MB +[2024-07-25 01:53:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][580/625] eta 0:00:26 lr 0.000882 wd 0.0500 time 0.7146 (0.5920) data time 0.0008 (0.0016) model time 0.7138 (0.5917) loss 9.1023 (7.6658) grad_norm 2.0140 (inf) loss_scale 1024.0000 (1143.8485) mem 22339MB +[2024-07-25 01:53:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][590/625] eta 0:00:20 lr 0.000882 wd 0.0500 time 0.5652 (0.5920) data time 0.0008 (0.0016) model time 0.5644 (0.5918) loss 7.5054 (7.6751) grad_norm 2.7005 (inf) loss_scale 1024.0000 (1141.8206) mem 22339MB +[2024-07-25 01:53:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][600/625] eta 0:00:14 lr 0.000882 wd 0.0500 time 0.5667 (0.5918) data time 0.0008 (0.0016) model time 0.5660 (0.5915) loss 6.8605 (7.6809) grad_norm 1.7455 (inf) loss_scale 1024.0000 (1139.8602) mem 22339MB +[2024-07-25 01:53:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][610/625] eta 0:00:08 lr 0.000882 wd 0.0500 time 0.5615 (0.5916) data time 0.0004 (0.0016) model time 0.5611 (0.5913) loss 7.6743 (7.6736) grad_norm 2.5285 (inf) loss_scale 1024.0000 (1137.9640) mem 22339MB +[2024-07-25 01:54:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [116/300][620/625] eta 0:00:02 lr 0.000882 wd 0.0500 time 0.5654 (0.5915) data time 0.0006 (0.0016) model time 0.5648 (0.5912) loss 6.2383 (7.6755) grad_norm 2.7106 (inf) loss_scale 1024.0000 (1136.1288) mem 22339MB +[2024-07-25 01:54:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 116 training takes 0:06:09 +[2024-07-25 01:54:05 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 01:54:06 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 01:54:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.462 (0.462) Loss 0.5469 (0.5469) Acc@1 88.672 (88.672) Acc@5 98.242 (98.242) Mem 22339MB +[2024-07-25 01:54:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.156) Loss 0.8857 (0.6760) Acc@1 79.150 (85.463) Acc@5 95.264 (97.381) Mem 22339MB +[2024-07-25 01:54:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9697 (0.8012) Acc@1 76.611 (81.964) Acc@5 94.043 (96.019) Mem 22339MB +[2024-07-25 01:54:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.700 Acc@5 96.059 +[2024-07-25 01:54:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.7% +[2024-07-25 01:54:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.816 (0.816) Loss 0.5034 (0.5034) Acc@1 89.355 (89.355) Acc@5 98.682 (98.682) Mem 22339MB +[2024-07-25 01:54:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.188) Loss 0.8130 (0.6385) Acc@1 80.469 (86.084) Acc@5 96.094 (97.661) Mem 22339MB +[2024-07-25 01:54:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.158) Loss 0.9282 (0.7503) Acc@1 76.514 (82.833) Acc@5 95.361 (96.519) Mem 22339MB +[2024-07-25 01:54:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.504 Acc@5 96.525 +[2024-07-25 01:54:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.5% +[2024-07-25 01:54:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.50% +[2024-07-25 01:54:13 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 01:54:15 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 01:54:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][0/625] eta 0:09:15 lr 0.000882 wd 0.0500 time 0.8891 (0.8891) data time 0.3709 (0.3709) model time 0.0000 (0.0000) loss 8.1833 (8.1833) grad_norm 2.0309 (2.0309) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:54:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][10/625] eta 0:06:12 lr 0.000881 wd 0.0500 time 0.5631 (0.6059) data time 0.0006 (0.0345) model time 0.0000 (0.0000) loss 6.8261 (7.6408) grad_norm 2.6343 (2.0458) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:54:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][20/625] eta 0:05:58 lr 0.000881 wd 0.0500 time 0.5634 (0.5925) data time 0.0006 (0.0185) model time 0.0000 (0.0000) loss 7.1638 (7.6012) grad_norm 2.3342 (2.0816) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:54:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][30/625] eta 0:05:49 lr 0.000881 wd 0.0500 time 0.5650 (0.5876) data time 0.0006 (0.0128) model time 0.0000 (0.0000) loss 5.6945 (7.5347) grad_norm 1.7554 (2.2032) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:54:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][40/625] eta 0:05:42 lr 0.000881 wd 0.0500 time 0.5646 (0.5851) data time 0.0006 (0.0099) model time 0.0000 (0.0000) loss 8.1916 (7.5392) grad_norm 2.1329 (2.2158) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:54:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][50/625] eta 0:05:35 lr 0.000881 wd 0.0500 time 0.5644 (0.5835) data time 0.0008 (0.0081) model time 0.0000 (0.0000) loss 7.2100 (7.5818) grad_norm 2.3515 (2.2509) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:54:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][60/625] eta 0:05:29 lr 0.000881 wd 0.0500 time 0.5652 (0.5824) data time 0.0006 (0.0069) model time 0.5646 (0.5760) loss 7.3465 (7.5771) grad_norm 1.9955 (2.2530) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:54:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][70/625] eta 0:05:23 lr 0.000881 wd 0.0500 time 0.5646 (0.5821) data time 0.0006 (0.0060) model time 0.5640 (0.5778) loss 8.7958 (7.6184) grad_norm 2.0939 (2.2218) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:55:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][80/625] eta 0:05:16 lr 0.000881 wd 0.0500 time 0.5697 (0.5816) data time 0.0009 (0.0054) model time 0.5688 (0.5775) loss 8.9431 (7.6693) grad_norm 3.6423 (2.1993) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:55:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][90/625] eta 0:05:10 lr 0.000881 wd 0.0500 time 0.5655 (0.5813) data time 0.0006 (0.0049) model time 0.5649 (0.5776) loss 6.6041 (7.6826) grad_norm 2.5644 (2.2192) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:55:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][100/625] eta 0:05:04 lr 0.000881 wd 0.0500 time 0.5624 (0.5809) data time 0.0006 (0.0045) model time 0.5617 (0.5773) loss 8.7241 (7.6630) grad_norm 2.5279 (2.2532) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:55:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][110/625] eta 0:04:59 lr 0.000881 wd 0.0500 time 0.5709 (0.5806) data time 0.0008 (0.0042) model time 0.5701 (0.5773) loss 7.1831 (7.6444) grad_norm 2.5310 (2.2400) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:55:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][120/625] eta 0:04:53 lr 0.000880 wd 0.0500 time 0.5721 (0.5804) data time 0.0006 (0.0039) model time 0.5715 (0.5772) loss 6.9556 (7.6427) grad_norm 1.7275 (2.2419) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:55:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][130/625] eta 0:04:47 lr 0.000880 wd 0.0500 time 0.5678 (0.5804) data time 0.0006 (0.0037) model time 0.5671 (0.5775) loss 6.3040 (7.6690) grad_norm 1.9430 (2.2491) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:55:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][140/625] eta 0:04:41 lr 0.000880 wd 0.0500 time 0.5709 (0.5803) data time 0.0008 (0.0035) model time 0.5701 (0.5775) loss 7.4008 (7.6815) grad_norm 1.6638 (2.2416) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:55:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][150/625] eta 0:04:37 lr 0.000880 wd 0.0500 time 0.5626 (0.5835) data time 0.0008 (0.0033) model time 0.5618 (0.5826) loss 6.8149 (7.6596) grad_norm 2.0500 (2.2404) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:55:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][160/625] eta 0:04:32 lr 0.000880 wd 0.0500 time 0.7475 (0.5868) data time 0.0006 (0.0031) model time 0.7469 (0.5874) loss 6.8474 (7.6275) grad_norm 3.0922 (2.2772) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:55:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][170/625] eta 0:04:27 lr 0.000880 wd 0.0500 time 0.5637 (0.5887) data time 0.0008 (0.0030) model time 0.5630 (0.5901) loss 8.6359 (7.6358) grad_norm 1.8310 (2.2685) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:56:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][180/625] eta 0:04:22 lr 0.000880 wd 0.0500 time 0.5682 (0.5899) data time 0.0006 (0.0029) model time 0.5676 (0.5915) loss 8.3542 (7.6314) grad_norm 2.2867 (2.2901) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:56:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][190/625] eta 0:04:16 lr 0.000880 wd 0.0500 time 0.5684 (0.5893) data time 0.0008 (0.0028) model time 0.5676 (0.5906) loss 8.4369 (7.6424) grad_norm 1.9407 (2.3012) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:56:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][200/625] eta 0:04:10 lr 0.000880 wd 0.0500 time 0.5649 (0.5887) data time 0.0008 (0.0027) model time 0.5640 (0.5896) loss 8.4676 (7.6339) grad_norm 2.7867 (2.2871) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:56:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][210/625] eta 0:04:04 lr 0.000880 wd 0.0500 time 0.5656 (0.5882) data time 0.0006 (0.0026) model time 0.5650 (0.5889) loss 7.9185 (7.6481) grad_norm 1.5942 (2.2850) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:56:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][220/625] eta 0:03:58 lr 0.000880 wd 0.0500 time 0.5612 (0.5877) data time 0.0009 (0.0025) model time 0.5603 (0.5881) loss 7.0299 (7.6306) grad_norm 2.1978 (2.2744) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:56:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][230/625] eta 0:03:51 lr 0.000879 wd 0.0500 time 0.5633 (0.5873) data time 0.0008 (0.0025) model time 0.5625 (0.5875) loss 8.9815 (7.6433) grad_norm 2.1208 (2.2770) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:56:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][240/625] eta 0:03:45 lr 0.000879 wd 0.0500 time 0.5623 (0.5869) data time 0.0006 (0.0024) model time 0.5617 (0.5870) loss 7.2688 (7.6406) grad_norm 1.7339 (2.2961) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:56:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][250/625] eta 0:03:39 lr 0.000879 wd 0.0500 time 0.5603 (0.5865) data time 0.0006 (0.0023) model time 0.5596 (0.5865) loss 6.6074 (7.6464) grad_norm 1.6709 (2.2916) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:56:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][260/625] eta 0:03:33 lr 0.000879 wd 0.0500 time 0.5619 (0.5862) data time 0.0007 (0.0023) model time 0.5612 (0.5861) loss 6.7535 (7.6574) grad_norm 2.3868 (2.2897) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:56:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][270/625] eta 0:03:28 lr 0.000879 wd 0.0500 time 0.5698 (0.5859) data time 0.0008 (0.0022) model time 0.5690 (0.5856) loss 9.0067 (7.6705) grad_norm 1.6784 (2.2787) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:56:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][280/625] eta 0:03:22 lr 0.000879 wd 0.0500 time 0.5659 (0.5856) data time 0.0006 (0.0022) model time 0.5653 (0.5852) loss 9.7200 (7.6776) grad_norm 3.3109 (2.2753) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:57:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][290/625] eta 0:03:16 lr 0.000879 wd 0.0500 time 0.5637 (0.5859) data time 0.0006 (0.0021) model time 0.5631 (0.5855) loss 6.6326 (7.6768) grad_norm 1.8444 (2.2651) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:57:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][300/625] eta 0:03:10 lr 0.000879 wd 0.0500 time 0.5691 (0.5856) data time 0.0009 (0.0021) model time 0.5681 (0.5852) loss 8.8728 (7.6725) grad_norm 1.8583 (2.2482) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:57:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][310/625] eta 0:03:04 lr 0.000879 wd 0.0500 time 0.5671 (0.5853) data time 0.0008 (0.0021) model time 0.5663 (0.5848) loss 8.7647 (7.6586) grad_norm 2.5565 (2.2407) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:57:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][320/625] eta 0:02:58 lr 0.000879 wd 0.0500 time 0.5656 (0.5851) data time 0.0006 (0.0020) model time 0.5650 (0.5845) loss 7.4124 (7.6446) grad_norm 3.2463 (2.2369) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:57:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][330/625] eta 0:02:52 lr 0.000878 wd 0.0500 time 0.5707 (0.5849) data time 0.0008 (0.0020) model time 0.5699 (0.5842) loss 7.9818 (7.6522) grad_norm 3.1034 (2.2456) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:57:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][340/625] eta 0:02:46 lr 0.000878 wd 0.0500 time 0.5671 (0.5846) data time 0.0007 (0.0019) model time 0.5664 (0.5840) loss 7.6673 (7.6505) grad_norm 3.1296 (2.2592) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:57:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][350/625] eta 0:02:40 lr 0.000878 wd 0.0500 time 0.5670 (0.5845) data time 0.0008 (0.0019) model time 0.5662 (0.5838) loss 6.7662 (7.6502) grad_norm 2.0678 (2.2614) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:57:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][360/625] eta 0:02:34 lr 0.000878 wd 0.0500 time 0.5636 (0.5846) data time 0.0006 (0.0019) model time 0.5630 (0.5839) loss 8.4970 (7.6371) grad_norm 2.4932 (2.2759) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:57:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][370/625] eta 0:02:29 lr 0.000878 wd 0.0500 time 0.7159 (0.5853) data time 0.0008 (0.0019) model time 0.7151 (0.5847) loss 7.2997 (7.6440) grad_norm 1.7509 (2.2899) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:57:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][380/625] eta 0:02:23 lr 0.000878 wd 0.0500 time 0.7479 (0.5868) data time 0.0006 (0.0019) model time 0.7473 (0.5865) loss 8.0142 (7.6407) grad_norm 2.6737 (2.2813) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:58:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][390/625] eta 0:02:18 lr 0.000878 wd 0.0500 time 0.5659 (0.5873) data time 0.0008 (0.0018) model time 0.5651 (0.5870) loss 6.8373 (7.6413) grad_norm 1.9144 (2.2708) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:58:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][400/625] eta 0:02:12 lr 0.000878 wd 0.0500 time 0.5605 (0.5878) data time 0.0008 (0.0018) model time 0.5597 (0.5875) loss 6.7013 (7.6381) grad_norm 1.9038 (2.2690) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:58:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][410/625] eta 0:02:06 lr 0.000878 wd 0.0500 time 0.5624 (0.5876) data time 0.0009 (0.0018) model time 0.5616 (0.5873) loss 6.2465 (7.6378) grad_norm 3.0221 (2.2779) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:58:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][420/625] eta 0:02:00 lr 0.000878 wd 0.0500 time 0.5649 (0.5875) data time 0.0008 (0.0018) model time 0.5640 (0.5872) loss 8.2187 (7.6393) grad_norm 2.1553 (2.2833) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:58:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][430/625] eta 0:01:54 lr 0.000878 wd 0.0500 time 0.5639 (0.5875) data time 0.0008 (0.0018) model time 0.5631 (0.5871) loss 9.1353 (7.6439) grad_norm 2.6771 (2.2868) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:58:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][440/625] eta 0:01:48 lr 0.000877 wd 0.0500 time 0.5796 (0.5873) data time 0.0006 (0.0017) model time 0.5789 (0.5869) loss 6.0736 (7.6405) grad_norm 1.8692 (2.2824) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:58:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][450/625] eta 0:01:42 lr 0.000877 wd 0.0500 time 0.5625 (0.5870) data time 0.0006 (0.0017) model time 0.5620 (0.5866) loss 7.2963 (7.6453) grad_norm 2.4967 (2.2806) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:58:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][460/625] eta 0:01:36 lr 0.000877 wd 0.0500 time 0.5660 (0.5868) data time 0.0006 (0.0017) model time 0.5654 (0.5864) loss 7.7532 (7.6474) grad_norm 1.5270 (2.2736) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:58:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][470/625] eta 0:01:30 lr 0.000877 wd 0.0500 time 0.5683 (0.5866) data time 0.0006 (0.0017) model time 0.5677 (0.5861) loss 7.6635 (7.6388) grad_norm 2.1634 (2.2704) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:58:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][480/625] eta 0:01:25 lr 0.000877 wd 0.0500 time 0.5722 (0.5865) data time 0.0008 (0.0017) model time 0.5714 (0.5859) loss 7.9722 (7.6393) grad_norm 4.3434 (2.2779) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:59:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][490/625] eta 0:01:19 lr 0.000877 wd 0.0500 time 0.5693 (0.5863) data time 0.0006 (0.0016) model time 0.5686 (0.5857) loss 7.6622 (7.6364) grad_norm 1.8525 (2.2795) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:59:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][500/625] eta 0:01:13 lr 0.000877 wd 0.0500 time 0.5640 (0.5861) data time 0.0009 (0.0016) model time 0.5632 (0.5855) loss 8.7131 (7.6360) grad_norm 2.2968 (2.2766) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:59:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][510/625] eta 0:01:07 lr 0.000877 wd 0.0500 time 0.5622 (0.5863) data time 0.0007 (0.0016) model time 0.5615 (0.5857) loss 7.9531 (7.6273) grad_norm 1.9331 (2.2706) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:59:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][520/625] eta 0:01:01 lr 0.000877 wd 0.0500 time 0.5609 (0.5862) data time 0.0008 (0.0016) model time 0.5601 (0.5856) loss 6.5273 (7.6254) grad_norm 2.1158 (2.2696) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:59:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][530/625] eta 0:00:55 lr 0.000877 wd 0.0500 time 0.5618 (0.5863) data time 0.0006 (0.0016) model time 0.5612 (0.5856) loss 9.0798 (7.6300) grad_norm 2.5534 (2.2641) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:59:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][540/625] eta 0:00:49 lr 0.000876 wd 0.0500 time 0.5655 (0.5861) data time 0.0008 (0.0016) model time 0.5647 (0.5855) loss 7.8409 (7.6356) grad_norm 2.4237 (2.2645) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:59:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][550/625] eta 0:00:43 lr 0.000876 wd 0.0500 time 0.5680 (0.5859) data time 0.0008 (0.0016) model time 0.5673 (0.5853) loss 8.0500 (7.6427) grad_norm 1.7138 (2.2632) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:59:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][560/625] eta 0:00:38 lr 0.000876 wd 0.0500 time 0.5649 (0.5858) data time 0.0006 (0.0016) model time 0.5643 (0.5851) loss 6.8359 (7.6433) grad_norm 1.7922 (2.2685) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:59:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][570/625] eta 0:00:32 lr 0.000876 wd 0.0500 time 0.5655 (0.5856) data time 0.0008 (0.0015) model time 0.5647 (0.5849) loss 8.3416 (7.6443) grad_norm 2.1915 (2.2668) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 01:59:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][580/625] eta 0:00:26 lr 0.000876 wd 0.0500 time 0.5598 (0.5857) data time 0.0006 (0.0015) model time 0.5592 (0.5850) loss 7.0822 (7.6371) grad_norm 2.1158 (2.2620) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:00:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][590/625] eta 0:00:20 lr 0.000876 wd 0.0500 time 0.7448 (0.5868) data time 0.0008 (0.0015) model time 0.7440 (0.5862) loss 8.5017 (7.6399) grad_norm 1.7979 (2.2617) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:00:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][600/625] eta 0:00:14 lr 0.000876 wd 0.0500 time 0.7244 (0.5878) data time 0.0008 (0.0015) model time 0.7236 (0.5873) loss 6.4159 (7.6385) grad_norm 2.3967 (2.2571) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:00:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][610/625] eta 0:00:08 lr 0.000876 wd 0.0500 time 0.7049 (0.5887) data time 0.0004 (0.0015) model time 0.7045 (0.5882) loss 7.0073 (7.6456) grad_norm 2.3102 (2.2654) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:00:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [117/300][620/625] eta 0:00:02 lr 0.000876 wd 0.0500 time 0.5666 (0.5890) data time 0.0004 (0.0015) model time 0.5662 (0.5886) loss 7.6920 (7.6466) grad_norm 2.7069 (2.2697) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:00:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 117 training takes 0:06:08 +[2024-07-25 02:00:23 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 02:00:24 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 02:00:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.477 (0.477) Loss 0.5298 (0.5298) Acc@1 88.818 (88.818) Acc@5 98.682 (98.682) Mem 22339MB +[2024-07-25 02:00:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.8667 (0.6750) Acc@1 80.029 (85.467) Acc@5 95.166 (97.377) Mem 22339MB +[2024-07-25 02:00:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.9849 (0.7959) Acc@1 75.195 (81.992) Acc@5 94.531 (96.077) Mem 22339MB +[2024-07-25 02:00:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.676 Acc@5 96.091 +[2024-07-25 02:00:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.7% +[2024-07-25 02:00:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.841 (0.841) Loss 0.5034 (0.5034) Acc@1 89.307 (89.307) Acc@5 98.682 (98.682) Mem 22339MB +[2024-07-25 02:00:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.191) Loss 0.8105 (0.6377) Acc@1 80.518 (86.102) Acc@5 96.191 (97.692) Mem 22339MB +[2024-07-25 02:00:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.160) Loss 0.9268 (0.7493) Acc@1 76.465 (82.843) Acc@5 95.215 (96.533) Mem 22339MB +[2024-07-25 02:00:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.512 Acc@5 96.541 +[2024-07-25 02:00:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.5% +[2024-07-25 02:00:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.51% +[2024-07-25 02:00:32 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 02:00:33 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 02:00:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][0/625] eta 0:09:53 lr 0.000876 wd 0.0500 time 0.9503 (0.9503) data time 0.4316 (0.4316) model time 0.0000 (0.0000) loss 7.1248 (7.1248) grad_norm 1.6926 (1.6926) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:00:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][10/625] eta 0:06:15 lr 0.000876 wd 0.0500 time 0.5676 (0.6099) data time 0.0006 (0.0401) model time 0.0000 (0.0000) loss 9.0765 (7.7079) grad_norm 1.9450 (1.8715) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:00:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][20/625] eta 0:06:00 lr 0.000875 wd 0.0500 time 0.5653 (0.5960) data time 0.0007 (0.0215) model time 0.0000 (0.0000) loss 7.8463 (7.8762) grad_norm 2.1075 (1.9427) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:00:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][30/625] eta 0:05:50 lr 0.000875 wd 0.0500 time 0.5648 (0.5898) data time 0.0006 (0.0149) model time 0.0000 (0.0000) loss 7.7879 (7.7932) grad_norm 2.2916 (1.9685) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:00:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][40/625] eta 0:05:43 lr 0.000875 wd 0.0500 time 0.5661 (0.5869) data time 0.0009 (0.0114) model time 0.0000 (0.0000) loss 8.0956 (7.7626) grad_norm 4.8962 (2.0941) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:01:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][50/625] eta 0:05:38 lr 0.000875 wd 0.0500 time 0.5648 (0.5882) data time 0.0008 (0.0094) model time 0.0000 (0.0000) loss 6.5994 (7.6634) grad_norm 1.9083 (2.1131) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:01:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][60/625] eta 0:05:31 lr 0.000875 wd 0.0500 time 0.5669 (0.5868) data time 0.0008 (0.0084) model time 0.5662 (0.5761) loss 8.5846 (7.6251) grad_norm 1.7338 (2.1397) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:01:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][70/625] eta 0:05:25 lr 0.000875 wd 0.0500 time 0.5662 (0.5856) data time 0.0006 (0.0073) model time 0.5656 (0.5768) loss 7.8369 (7.6587) grad_norm 1.6684 (2.1232) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:01:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][80/625] eta 0:05:18 lr 0.000875 wd 0.0500 time 0.5669 (0.5848) data time 0.0008 (0.0065) model time 0.5662 (0.5772) loss 8.0901 (7.6704) grad_norm 1.4650 (2.1297) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:01:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][90/625] eta 0:05:12 lr 0.000875 wd 0.0500 time 0.5607 (0.5844) data time 0.0006 (0.0059) model time 0.5602 (0.5780) loss 7.5100 (7.6806) grad_norm 2.6961 (2.1440) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:01:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][100/625] eta 0:05:06 lr 0.000875 wd 0.0500 time 0.5666 (0.5843) data time 0.0008 (0.0054) model time 0.5658 (0.5789) loss 8.8255 (7.7333) grad_norm 1.6755 (2.1436) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:01:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][110/625] eta 0:05:00 lr 0.000875 wd 0.0500 time 0.5619 (0.5843) data time 0.0008 (0.0050) model time 0.5611 (0.5796) loss 7.4453 (7.7057) grad_norm 2.3377 (2.1534) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:01:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][120/625] eta 0:04:54 lr 0.000875 wd 0.0500 time 0.5674 (0.5838) data time 0.0010 (0.0047) model time 0.5665 (0.5793) loss 8.6135 (7.6800) grad_norm 3.5924 (2.1615) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:01:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][130/625] eta 0:04:48 lr 0.000874 wd 0.0500 time 0.5659 (0.5835) data time 0.0008 (0.0044) model time 0.5651 (0.5792) loss 8.0678 (7.6662) grad_norm 1.7746 (2.1573) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:01:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][140/625] eta 0:04:42 lr 0.000874 wd 0.0500 time 0.5643 (0.5834) data time 0.0007 (0.0041) model time 0.5635 (0.5794) loss 8.1174 (7.6821) grad_norm 1.7712 (2.1434) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:02:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][150/625] eta 0:04:37 lr 0.000874 wd 0.0500 time 0.5664 (0.5833) data time 0.0007 (0.0039) model time 0.5658 (0.5796) loss 8.4663 (7.6699) grad_norm 1.9718 (2.1373) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:02:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][160/625] eta 0:04:31 lr 0.000874 wd 0.0500 time 0.5662 (0.5832) data time 0.0006 (0.0037) model time 0.5656 (0.5798) loss 7.0554 (7.7062) grad_norm 1.6121 (2.1469) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:02:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][170/625] eta 0:04:25 lr 0.000874 wd 0.0500 time 0.5666 (0.5834) data time 0.0008 (0.0036) model time 0.5658 (0.5802) loss 8.5547 (7.7081) grad_norm 1.9072 (2.1538) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:02:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][180/625] eta 0:04:19 lr 0.000874 wd 0.0500 time 0.5629 (0.5832) data time 0.0006 (0.0034) model time 0.5623 (0.5801) loss 7.7829 (7.7125) grad_norm 1.9359 (2.1425) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:02:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][190/625] eta 0:04:15 lr 0.000874 wd 0.0500 time 0.5592 (0.5871) data time 0.0007 (0.0033) model time 0.5585 (0.5857) loss 6.9716 (7.6843) grad_norm 1.8384 (2.1516) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:02:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][200/625] eta 0:04:11 lr 0.000874 wd 0.0500 time 0.5688 (0.5921) data time 0.0008 (0.0032) model time 0.5680 (0.5923) loss 8.6122 (7.6852) grad_norm 3.2857 (2.1707) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:02:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][210/625] eta 0:04:06 lr 0.000874 wd 0.0500 time 0.7300 (0.5941) data time 0.0010 (0.0031) model time 0.7290 (0.5949) loss 9.0458 (7.6952) grad_norm 1.8153 (2.2066) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:02:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][220/625] eta 0:04:00 lr 0.000874 wd 0.0500 time 0.5658 (0.5934) data time 0.0008 (0.0030) model time 0.5650 (0.5939) loss 7.5961 (7.6912) grad_norm 2.0059 (2.1953) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:02:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][230/625] eta 0:03:54 lr 0.000873 wd 0.0500 time 0.5654 (0.5926) data time 0.0006 (0.0029) model time 0.5648 (0.5928) loss 7.4283 (7.6815) grad_norm 1.8442 (2.1789) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:02:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][240/625] eta 0:03:47 lr 0.000873 wd 0.0500 time 0.5602 (0.5920) data time 0.0009 (0.0028) model time 0.5593 (0.5919) loss 6.4015 (7.6725) grad_norm 2.5176 (2.1753) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:03:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][250/625] eta 0:03:41 lr 0.000873 wd 0.0500 time 0.5633 (0.5913) data time 0.0006 (0.0027) model time 0.5627 (0.5911) loss 6.1647 (7.6751) grad_norm 1.9791 (2.1678) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:03:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][260/625] eta 0:03:35 lr 0.000873 wd 0.0500 time 0.5731 (0.5909) data time 0.0006 (0.0026) model time 0.5725 (0.5905) loss 8.1997 (7.6929) grad_norm 1.8361 (2.1594) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:03:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][270/625] eta 0:03:29 lr 0.000873 wd 0.0500 time 0.5703 (0.5909) data time 0.0006 (0.0026) model time 0.5698 (0.5906) loss 8.2257 (7.7017) grad_norm 2.1911 (2.1523) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:03:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][280/625] eta 0:03:23 lr 0.000873 wd 0.0500 time 0.5619 (0.5905) data time 0.0006 (0.0025) model time 0.5613 (0.5900) loss 8.0017 (7.6952) grad_norm 2.0314 (2.1470) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:03:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][290/625] eta 0:03:17 lr 0.000873 wd 0.0500 time 0.5681 (0.5901) data time 0.0008 (0.0024) model time 0.5673 (0.5895) loss 9.1938 (7.6962) grad_norm 1.6476 (2.1439) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:03:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][300/625] eta 0:03:11 lr 0.000873 wd 0.0500 time 0.5653 (0.5897) data time 0.0008 (0.0024) model time 0.5646 (0.5890) loss 6.8206 (7.6933) grad_norm 1.7816 (2.1377) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:03:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][310/625] eta 0:03:05 lr 0.000873 wd 0.0500 time 0.5607 (0.5893) data time 0.0006 (0.0023) model time 0.5600 (0.5885) loss 8.6280 (7.6998) grad_norm 2.0257 (2.1313) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:03:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][320/625] eta 0:02:59 lr 0.000873 wd 0.0500 time 0.5651 (0.5889) data time 0.0008 (0.0023) model time 0.5643 (0.5881) loss 7.8245 (7.6941) grad_norm 1.8823 (2.1289) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:03:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][330/625] eta 0:02:53 lr 0.000873 wd 0.0500 time 0.5625 (0.5886) data time 0.0006 (0.0022) model time 0.5620 (0.5877) loss 8.0266 (7.6760) grad_norm 1.7360 (2.1193) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:03:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][340/625] eta 0:02:47 lr 0.000872 wd 0.0500 time 0.5650 (0.5883) data time 0.0008 (0.0022) model time 0.5642 (0.5874) loss 5.2203 (7.6712) grad_norm 2.2646 (2.1199) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:04:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][350/625] eta 0:02:41 lr 0.000872 wd 0.0500 time 0.5665 (0.5880) data time 0.0006 (0.0022) model time 0.5659 (0.5871) loss 8.8650 (7.6736) grad_norm 2.6669 (2.1290) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:04:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][360/625] eta 0:02:35 lr 0.000872 wd 0.0500 time 0.5627 (0.5878) data time 0.0008 (0.0021) model time 0.5619 (0.5867) loss 5.9906 (7.6800) grad_norm 2.3070 (2.1427) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:04:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][370/625] eta 0:02:29 lr 0.000872 wd 0.0500 time 0.5665 (0.5875) data time 0.0008 (0.0021) model time 0.5657 (0.5864) loss 7.8730 (7.6836) grad_norm 3.0781 (2.1474) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:04:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][380/625] eta 0:02:23 lr 0.000872 wd 0.0500 time 0.5716 (0.5872) data time 0.0008 (0.0021) model time 0.5708 (0.5861) loss 7.1783 (7.6852) grad_norm 1.7826 (2.1518) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:04:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][390/625] eta 0:02:17 lr 0.000872 wd 0.0500 time 0.5635 (0.5869) data time 0.0008 (0.0020) model time 0.5627 (0.5858) loss 7.0232 (7.6817) grad_norm 2.6741 (2.1510) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:04:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][400/625] eta 0:02:12 lr 0.000872 wd 0.0500 time 0.6896 (0.5870) data time 0.0008 (0.0020) model time 0.6888 (0.5859) loss 8.3126 (7.6859) grad_norm 2.0153 (2.1461) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:04:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][410/625] eta 0:02:06 lr 0.000872 wd 0.0500 time 0.6925 (0.5893) data time 0.0009 (0.0020) model time 0.6916 (0.5885) loss 8.7848 (7.6896) grad_norm 2.0632 (2.1411) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:04:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][420/625] eta 0:02:01 lr 0.000872 wd 0.0500 time 0.5644 (0.5916) data time 0.0007 (0.0020) model time 0.5638 (0.5911) loss 7.1626 (7.6905) grad_norm 1.4211 (2.1407) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:04:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][430/625] eta 0:01:55 lr 0.000872 wd 0.0500 time 0.7132 (0.5924) data time 0.0006 (0.0019) model time 0.7126 (0.5920) loss 7.0072 (7.6886) grad_norm 1.6425 (2.1512) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:04:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][440/625] eta 0:01:49 lr 0.000871 wd 0.0500 time 0.5656 (0.5921) data time 0.0009 (0.0019) model time 0.5647 (0.5917) loss 7.3980 (7.6875) grad_norm 2.0754 (2.1518) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:05:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][450/625] eta 0:01:43 lr 0.000871 wd 0.0500 time 0.5639 (0.5918) data time 0.0006 (0.0019) model time 0.5633 (0.5913) loss 7.2394 (7.6826) grad_norm 1.9908 (2.1536) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:05:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][460/625] eta 0:01:37 lr 0.000871 wd 0.0500 time 0.5721 (0.5915) data time 0.0009 (0.0019) model time 0.5712 (0.5909) loss 8.8285 (7.6841) grad_norm 2.0712 (2.1540) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:05:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][470/625] eta 0:01:31 lr 0.000871 wd 0.0500 time 0.5684 (0.5912) data time 0.0008 (0.0019) model time 0.5676 (0.5906) loss 8.4371 (7.6832) grad_norm 1.8589 (2.1540) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:05:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][480/625] eta 0:01:25 lr 0.000871 wd 0.0500 time 0.5640 (0.5911) data time 0.0008 (0.0018) model time 0.5632 (0.5904) loss 6.0871 (7.6769) grad_norm 1.9214 (2.1493) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:05:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][490/625] eta 0:01:19 lr 0.000871 wd 0.0500 time 0.5648 (0.5910) data time 0.0008 (0.0018) model time 0.5640 (0.5904) loss 6.3637 (7.6659) grad_norm 1.9011 (2.1447) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:05:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][500/625] eta 0:01:13 lr 0.000871 wd 0.0500 time 0.5614 (0.5909) data time 0.0009 (0.0018) model time 0.5605 (0.5903) loss 7.2120 (7.6674) grad_norm 3.0001 (2.1446) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:05:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][510/625] eta 0:01:07 lr 0.000871 wd 0.0500 time 0.5673 (0.5908) data time 0.0007 (0.0018) model time 0.5665 (0.5902) loss 8.4349 (7.6686) grad_norm 3.4872 (2.1495) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:05:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][520/625] eta 0:01:02 lr 0.000871 wd 0.0500 time 0.5659 (0.5906) data time 0.0006 (0.0018) model time 0.5653 (0.5899) loss 8.1815 (7.6748) grad_norm 3.8880 (2.1746) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:05:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][530/625] eta 0:00:56 lr 0.000871 wd 0.0500 time 0.5662 (0.5904) data time 0.0007 (0.0018) model time 0.5656 (0.5896) loss 7.4985 (7.6752) grad_norm 2.4925 (2.1833) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:05:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][540/625] eta 0:00:50 lr 0.000871 wd 0.0500 time 0.5611 (0.5902) data time 0.0006 (0.0017) model time 0.5606 (0.5894) loss 7.5748 (7.6766) grad_norm 1.6536 (2.1829) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:05:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][550/625] eta 0:00:44 lr 0.000870 wd 0.0500 time 0.5642 (0.5901) data time 0.0006 (0.0017) model time 0.5636 (0.5893) loss 7.4654 (7.6790) grad_norm 1.8798 (2.1835) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:06:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][560/625] eta 0:00:38 lr 0.000870 wd 0.0500 time 0.5653 (0.5900) data time 0.0008 (0.0017) model time 0.5645 (0.5892) loss 7.9446 (7.6746) grad_norm 2.6897 (2.1804) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:06:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][570/625] eta 0:00:32 lr 0.000870 wd 0.0500 time 0.5684 (0.5898) data time 0.0008 (0.0017) model time 0.5676 (0.5890) loss 7.1804 (7.6685) grad_norm 1.8070 (2.1829) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:06:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][580/625] eta 0:00:26 lr 0.000870 wd 0.0500 time 0.5687 (0.5896) data time 0.0006 (0.0017) model time 0.5680 (0.5888) loss 6.6099 (7.6775) grad_norm 2.5058 (2.1983) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:06:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][590/625] eta 0:00:20 lr 0.000870 wd 0.0500 time 0.5702 (0.5895) data time 0.0006 (0.0017) model time 0.5696 (0.5886) loss 7.8952 (7.6774) grad_norm 2.2850 (2.2005) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:06:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][600/625] eta 0:00:14 lr 0.000870 wd 0.0500 time 0.5651 (0.5893) data time 0.0006 (0.0017) model time 0.5645 (0.5885) loss 7.4665 (7.6888) grad_norm 2.2970 (2.1973) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:06:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][610/625] eta 0:00:08 lr 0.000870 wd 0.0500 time 0.5600 (0.5895) data time 0.0006 (0.0017) model time 0.5594 (0.5887) loss 7.7361 (7.6902) grad_norm 2.0901 (2.1966) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:06:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [118/300][620/625] eta 0:00:02 lr 0.000870 wd 0.0500 time 0.6768 (0.5898) data time 0.0004 (0.0017) model time 0.6764 (0.5890) loss 9.1360 (7.6859) grad_norm 2.8878 (2.2045) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:06:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 118 training takes 0:06:08 +[2024-07-25 02:06:42 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 02:06:43 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 02:06:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.471 (0.471) Loss 0.5459 (0.5459) Acc@1 88.916 (88.916) Acc@5 98.535 (98.535) Mem 22339MB +[2024-07-25 02:06:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.8535 (0.6722) Acc@1 80.615 (85.542) Acc@5 95.850 (97.559) Mem 22339MB +[2024-07-25 02:06:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9917 (0.7948) Acc@1 76.074 (82.134) Acc@5 94.238 (96.150) Mem 22339MB +[2024-07-25 02:06:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.828 Acc@5 96.119 +[2024-07-25 02:06:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.8% +[2024-07-25 02:06:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 81.83% +[2024-07-25 02:06:47 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 02:06:48 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 02:06:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.481 (0.481) Loss 0.5034 (0.5034) Acc@1 89.307 (89.307) Acc@5 98.682 (98.682) Mem 22339MB +[2024-07-25 02:06:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.158) Loss 0.8105 (0.6370) Acc@1 80.811 (86.164) Acc@5 96.191 (97.687) Mem 22339MB +[2024-07-25 02:06:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.9253 (0.7485) Acc@1 76.367 (82.864) Acc@5 95.312 (96.519) Mem 22339MB +[2024-07-25 02:06:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.528 Acc@5 96.533 +[2024-07-25 02:06:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.5% +[2024-07-25 02:06:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.53% +[2024-07-25 02:06:52 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 02:06:53 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 02:06:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][0/625] eta 0:09:35 lr 0.000870 wd 0.0500 time 0.9215 (0.9215) data time 0.4019 (0.4019) model time 0.0000 (0.0000) loss 6.6962 (6.6962) grad_norm 2.7113 (2.7113) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:07:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][10/625] eta 0:07:06 lr 0.000870 wd 0.0500 time 0.7287 (0.6937) data time 0.0006 (0.0372) model time 0.0000 (0.0000) loss 8.5011 (7.3451) grad_norm 1.9218 (2.3252) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:07:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][20/625] eta 0:06:42 lr 0.000870 wd 0.0500 time 0.5608 (0.6649) data time 0.0008 (0.0205) model time 0.0000 (0.0000) loss 8.0582 (7.5988) grad_norm 2.3212 (2.2438) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:07:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][30/625] eta 0:06:26 lr 0.000869 wd 0.0500 time 0.5624 (0.6492) data time 0.0006 (0.0148) model time 0.0000 (0.0000) loss 7.4859 (7.5829) grad_norm 3.1599 (2.1943) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:07:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][40/625] eta 0:06:09 lr 0.000869 wd 0.0500 time 0.5631 (0.6319) data time 0.0007 (0.0114) model time 0.0000 (0.0000) loss 7.9558 (7.5482) grad_norm 2.1267 (2.1925) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:07:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][50/625] eta 0:05:57 lr 0.000869 wd 0.0500 time 0.5614 (0.6221) data time 0.0006 (0.0093) model time 0.0000 (0.0000) loss 7.7184 (7.6760) grad_norm 2.8648 (2.2409) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:07:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][60/625] eta 0:05:48 lr 0.000869 wd 0.0500 time 0.5640 (0.6160) data time 0.0007 (0.0079) model time 0.5632 (0.5840) loss 7.3768 (7.6815) grad_norm 1.7686 (2.2298) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:07:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][70/625] eta 0:05:39 lr 0.000869 wd 0.0500 time 0.5689 (0.6109) data time 0.0008 (0.0069) model time 0.5681 (0.5814) loss 8.7616 (7.6741) grad_norm 2.7337 (2.2508) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:07:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][80/625] eta 0:05:30 lr 0.000869 wd 0.0500 time 0.5660 (0.6068) data time 0.0008 (0.0062) model time 0.5652 (0.5799) loss 8.2196 (7.6719) grad_norm 1.5686 (2.2038) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:07:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][90/625] eta 0:05:22 lr 0.000869 wd 0.0500 time 0.5673 (0.6036) data time 0.0006 (0.0056) model time 0.5667 (0.5791) loss 8.0158 (7.6985) grad_norm 1.7333 (2.1765) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:07:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][100/625] eta 0:05:15 lr 0.000869 wd 0.0500 time 0.5641 (0.6011) data time 0.0006 (0.0051) model time 0.5635 (0.5789) loss 7.7111 (7.6717) grad_norm 1.8413 (2.1359) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:08:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][110/625] eta 0:05:08 lr 0.000869 wd 0.0500 time 0.5626 (0.5989) data time 0.0006 (0.0047) model time 0.5619 (0.5784) loss 8.3787 (7.7295) grad_norm 3.9471 (2.1611) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:08:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][120/625] eta 0:05:01 lr 0.000869 wd 0.0500 time 0.5617 (0.5976) data time 0.0006 (0.0044) model time 0.5611 (0.5788) loss 7.9377 (7.7462) grad_norm 4.0150 (2.2290) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:08:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][130/625] eta 0:04:55 lr 0.000868 wd 0.0500 time 0.5648 (0.5965) data time 0.0006 (0.0045) model time 0.5642 (0.5787) loss 7.3026 (7.7460) grad_norm 2.4966 (2.2096) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:08:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][140/625] eta 0:04:48 lr 0.000868 wd 0.0500 time 0.5670 (0.5956) data time 0.0007 (0.0042) model time 0.5662 (0.5793) loss 8.1397 (7.7576) grad_norm 2.4219 (2.2049) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:08:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][150/625] eta 0:04:42 lr 0.000868 wd 0.0500 time 0.5661 (0.5945) data time 0.0006 (0.0040) model time 0.5655 (0.5790) loss 6.9749 (7.7575) grad_norm 2.4729 (2.2087) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:08:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][160/625] eta 0:04:35 lr 0.000868 wd 0.0500 time 0.5670 (0.5935) data time 0.0008 (0.0038) model time 0.5663 (0.5790) loss 7.6254 (7.7685) grad_norm 1.5719 (2.1854) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:08:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][170/625] eta 0:04:29 lr 0.000868 wd 0.0500 time 0.5636 (0.5926) data time 0.0007 (0.0036) model time 0.5630 (0.5788) loss 7.4518 (7.7486) grad_norm 2.8617 (2.1793) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:08:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][180/625] eta 0:04:23 lr 0.000868 wd 0.0500 time 0.5612 (0.5921) data time 0.0008 (0.0035) model time 0.5604 (0.5791) loss 7.6808 (7.7691) grad_norm 2.2389 (2.1973) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:08:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][190/625] eta 0:04:17 lr 0.000868 wd 0.0500 time 0.5611 (0.5921) data time 0.0008 (0.0033) model time 0.5604 (0.5800) loss 7.5267 (7.7451) grad_norm 1.6605 (2.1909) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:08:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][200/625] eta 0:04:11 lr 0.000868 wd 0.0500 time 0.5621 (0.5919) data time 0.0008 (0.0032) model time 0.5614 (0.5804) loss 8.8539 (7.7526) grad_norm 2.3455 (2.1836) loss_scale 2048.0000 (1064.7562) mem 22339MB +[2024-07-25 02:08:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][210/625] eta 0:04:05 lr 0.000868 wd 0.0500 time 0.5636 (0.5914) data time 0.0006 (0.0031) model time 0.5629 (0.5805) loss 7.2288 (7.7355) grad_norm 1.8679 (2.1830) loss_scale 2048.0000 (1111.3555) mem 22339MB +[2024-07-25 02:09:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][220/625] eta 0:04:00 lr 0.000868 wd 0.0500 time 0.7419 (0.5932) data time 0.0007 (0.0030) model time 0.7412 (0.5834) loss 5.8199 (7.7188) grad_norm 1.7789 (2.1714) loss_scale 2048.0000 (1153.7376) mem 22339MB +[2024-07-25 02:09:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][230/625] eta 0:03:55 lr 0.000868 wd 0.0500 time 0.7090 (0.5952) data time 0.0007 (0.0029) model time 0.7082 (0.5864) loss 9.1893 (7.7108) grad_norm 1.7089 (2.1589) loss_scale 2048.0000 (1192.4502) mem 22339MB +[2024-07-25 02:09:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][240/625] eta 0:03:49 lr 0.000867 wd 0.0500 time 0.7250 (0.5973) data time 0.0006 (0.0028) model time 0.7244 (0.5895) loss 7.8686 (7.7055) grad_norm 4.7550 (2.1744) loss_scale 2048.0000 (1227.9502) mem 22339MB +[2024-07-25 02:09:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][250/625] eta 0:03:43 lr 0.000867 wd 0.0500 time 0.5632 (0.5971) data time 0.0008 (0.0027) model time 0.5624 (0.5897) loss 8.6364 (7.7074) grad_norm 2.4422 (2.1909) loss_scale 2048.0000 (1260.6215) mem 22339MB +[2024-07-25 02:09:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][260/625] eta 0:03:37 lr 0.000867 wd 0.0500 time 0.5630 (0.5963) data time 0.0008 (0.0027) model time 0.5621 (0.5890) loss 8.3466 (7.7127) grad_norm 2.9022 (2.1955) loss_scale 2048.0000 (1290.7893) mem 22339MB +[2024-07-25 02:09:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][270/625] eta 0:03:31 lr 0.000867 wd 0.0500 time 0.5605 (0.5957) data time 0.0006 (0.0026) model time 0.5599 (0.5885) loss 9.4803 (7.7102) grad_norm 2.3558 (2.1923) loss_scale 2048.0000 (1318.7306) mem 22339MB +[2024-07-25 02:09:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][280/625] eta 0:03:25 lr 0.000867 wd 0.0500 time 0.5636 (0.5952) data time 0.0006 (0.0025) model time 0.5630 (0.5882) loss 7.0361 (7.7111) grad_norm 2.4374 (2.2009) loss_scale 2048.0000 (1344.6833) mem 22339MB +[2024-07-25 02:09:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][290/625] eta 0:03:19 lr 0.000867 wd 0.0500 time 0.5612 (0.5950) data time 0.0008 (0.0025) model time 0.5604 (0.5882) loss 6.7988 (7.7124) grad_norm 1.8300 (2.1937) loss_scale 2048.0000 (1368.8522) mem 22339MB +[2024-07-25 02:09:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][300/625] eta 0:03:13 lr 0.000867 wd 0.0500 time 0.5641 (0.5946) data time 0.0010 (0.0025) model time 0.5630 (0.5879) loss 8.5746 (7.7247) grad_norm 2.5480 (2.1950) loss_scale 2048.0000 (1391.4153) mem 22339MB +[2024-07-25 02:09:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][310/625] eta 0:03:07 lr 0.000867 wd 0.0500 time 0.5636 (0.5942) data time 0.0006 (0.0024) model time 0.5629 (0.5876) loss 7.8491 (7.7165) grad_norm 1.6109 (2.1894) loss_scale 2048.0000 (1412.5273) mem 22339MB +[2024-07-25 02:10:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][320/625] eta 0:03:01 lr 0.000867 wd 0.0500 time 0.5616 (0.5937) data time 0.0006 (0.0024) model time 0.5609 (0.5873) loss 8.4805 (7.7036) grad_norm 2.0397 (2.1828) loss_scale 2048.0000 (1432.3240) mem 22339MB +[2024-07-25 02:10:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][330/625] eta 0:02:54 lr 0.000867 wd 0.0500 time 0.5627 (0.5932) data time 0.0006 (0.0023) model time 0.5621 (0.5869) loss 9.0256 (7.7021) grad_norm 1.5907 (2.1933) loss_scale 2048.0000 (1450.9245) mem 22339MB +[2024-07-25 02:10:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][340/625] eta 0:02:48 lr 0.000866 wd 0.0500 time 0.5659 (0.5928) data time 0.0006 (0.0023) model time 0.5653 (0.5866) loss 7.0852 (7.6916) grad_norm 1.9630 (2.2059) loss_scale 2048.0000 (1468.4340) mem 22339MB +[2024-07-25 02:10:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][350/625] eta 0:02:42 lr 0.000866 wd 0.0500 time 0.5623 (0.5924) data time 0.0008 (0.0022) model time 0.5616 (0.5863) loss 8.1322 (7.6976) grad_norm 3.4083 (2.2185) loss_scale 2048.0000 (1484.9459) mem 22339MB +[2024-07-25 02:10:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][360/625] eta 0:02:36 lr 0.000866 wd 0.0500 time 0.5684 (0.5920) data time 0.0009 (0.0022) model time 0.5675 (0.5860) loss 8.1019 (7.6866) grad_norm 1.7194 (2.2137) loss_scale 2048.0000 (1500.5429) mem 22339MB +[2024-07-25 02:10:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][370/625] eta 0:02:30 lr 0.000866 wd 0.0500 time 0.5768 (0.5917) data time 0.0006 (0.0022) model time 0.5762 (0.5858) loss 7.1060 (7.6931) grad_norm 3.1444 (2.2094) loss_scale 2048.0000 (1515.2992) mem 22339MB +[2024-07-25 02:10:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][380/625] eta 0:02:24 lr 0.000866 wd 0.0500 time 0.5704 (0.5913) data time 0.0006 (0.0021) model time 0.5699 (0.5855) loss 8.3713 (7.6962) grad_norm 1.7904 (2.2025) loss_scale 2048.0000 (1529.2808) mem 22339MB +[2024-07-25 02:10:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][390/625] eta 0:02:18 lr 0.000866 wd 0.0500 time 0.5710 (0.5910) data time 0.0008 (0.0021) model time 0.5702 (0.5853) loss 7.7362 (7.6904) grad_norm 2.8859 (2.2002) loss_scale 2048.0000 (1542.5473) mem 22339MB +[2024-07-25 02:10:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][400/625] eta 0:02:12 lr 0.000866 wd 0.0500 time 0.5699 (0.5907) data time 0.0006 (0.0021) model time 0.5693 (0.5851) loss 8.2593 (7.7000) grad_norm 1.9667 (2.1962) loss_scale 2048.0000 (1555.1521) mem 22339MB +[2024-07-25 02:10:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][410/625] eta 0:02:06 lr 0.000866 wd 0.0500 time 0.5638 (0.5904) data time 0.0009 (0.0020) model time 0.5628 (0.5849) loss 7.1619 (7.6990) grad_norm 1.6224 (2.1906) loss_scale 2048.0000 (1567.1436) mem 22339MB +[2024-07-25 02:11:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][420/625] eta 0:02:00 lr 0.000866 wd 0.0500 time 0.5680 (0.5901) data time 0.0007 (0.0020) model time 0.5674 (0.5846) loss 9.7700 (7.6877) grad_norm 2.5211 (2.1911) loss_scale 2048.0000 (1578.5653) mem 22339MB +[2024-07-25 02:11:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][430/625] eta 0:01:55 lr 0.000866 wd 0.0500 time 0.5687 (0.5898) data time 0.0008 (0.0020) model time 0.5678 (0.5844) loss 6.6328 (7.6900) grad_norm 2.8395 (2.1967) loss_scale 2048.0000 (1589.4571) mem 22339MB +[2024-07-25 02:11:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][440/625] eta 0:01:49 lr 0.000866 wd 0.0500 time 0.6949 (0.5903) data time 0.0006 (0.0020) model time 0.6943 (0.5852) loss 6.0579 (7.6815) grad_norm 1.6246 (2.2068) loss_scale 2048.0000 (1599.8549) mem 22339MB +[2024-07-25 02:11:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][450/625] eta 0:01:43 lr 0.000865 wd 0.0500 time 0.7520 (0.5917) data time 0.0006 (0.0019) model time 0.7514 (0.5869) loss 8.5256 (7.6868) grad_norm 1.8410 (2.2015) loss_scale 2048.0000 (1609.7916) mem 22339MB +[2024-07-25 02:11:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][460/625] eta 0:01:37 lr 0.000865 wd 0.0500 time 0.5648 (0.5924) data time 0.0010 (0.0019) model time 0.5638 (0.5877) loss 7.5393 (7.6836) grad_norm 2.2093 (2.1960) loss_scale 2048.0000 (1619.2972) mem 22339MB +[2024-07-25 02:11:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][470/625] eta 0:01:31 lr 0.000865 wd 0.0500 time 0.5705 (0.5930) data time 0.0009 (0.0019) model time 0.5697 (0.5885) loss 8.8011 (7.6864) grad_norm 1.8869 (2.2067) loss_scale 2048.0000 (1628.3992) mem 22339MB +[2024-07-25 02:11:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][480/625] eta 0:01:25 lr 0.000865 wd 0.0500 time 0.5631 (0.5927) data time 0.0006 (0.0019) model time 0.5625 (0.5882) loss 7.3007 (7.6887) grad_norm 2.2228 (2.2187) loss_scale 2048.0000 (1637.1227) mem 22339MB +[2024-07-25 02:11:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][490/625] eta 0:01:19 lr 0.000865 wd 0.0500 time 0.5640 (0.5923) data time 0.0009 (0.0019) model time 0.5631 (0.5879) loss 7.9665 (7.6905) grad_norm 3.0047 (2.2234) loss_scale 2048.0000 (1645.4908) mem 22339MB +[2024-07-25 02:11:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][500/625] eta 0:01:14 lr 0.000865 wd 0.0500 time 0.5685 (0.5920) data time 0.0008 (0.0018) model time 0.5677 (0.5876) loss 6.1384 (7.6785) grad_norm 2.1765 (2.2215) loss_scale 2048.0000 (1653.5250) mem 22339MB +[2024-07-25 02:11:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][510/625] eta 0:01:08 lr 0.000865 wd 0.0500 time 0.5641 (0.5917) data time 0.0008 (0.0018) model time 0.5633 (0.5874) loss 7.6309 (7.6740) grad_norm 1.6567 (2.2128) loss_scale 2048.0000 (1661.2446) mem 22339MB +[2024-07-25 02:12:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][520/625] eta 0:01:02 lr 0.000865 wd 0.0500 time 0.5638 (0.5915) data time 0.0007 (0.0018) model time 0.5631 (0.5871) loss 6.4482 (7.6754) grad_norm 2.0243 (2.2085) loss_scale 2048.0000 (1668.6679) mem 22339MB +[2024-07-25 02:12:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][530/625] eta 0:00:56 lr 0.000865 wd 0.0500 time 0.5662 (0.5912) data time 0.0006 (0.0018) model time 0.5656 (0.5870) loss 8.7895 (7.6757) grad_norm 2.8473 (2.2048) loss_scale 2048.0000 (1675.8117) mem 22339MB +[2024-07-25 02:12:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][540/625] eta 0:00:50 lr 0.000865 wd 0.0500 time 0.5703 (0.5910) data time 0.0006 (0.0018) model time 0.5697 (0.5868) loss 8.8046 (7.6775) grad_norm 2.2600 (2.2036) loss_scale 2048.0000 (1682.6913) mem 22339MB +[2024-07-25 02:12:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][550/625] eta 0:00:44 lr 0.000864 wd 0.0500 time 0.5647 (0.5907) data time 0.0007 (0.0017) model time 0.5641 (0.5866) loss 8.2479 (7.6701) grad_norm 2.2921 (2.1994) loss_scale 2048.0000 (1689.3212) mem 22339MB +[2024-07-25 02:12:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][560/625] eta 0:00:38 lr 0.000864 wd 0.0500 time 0.5652 (0.5905) data time 0.0007 (0.0017) model time 0.5645 (0.5864) loss 7.5495 (7.6645) grad_norm 2.5772 (2.2013) loss_scale 2048.0000 (1695.7148) mem 22339MB +[2024-07-25 02:12:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][570/625] eta 0:00:32 lr 0.000864 wd 0.0500 time 0.5687 (0.5903) data time 0.0006 (0.0017) model time 0.5682 (0.5862) loss 8.7049 (7.6721) grad_norm 2.1457 (2.2025) loss_scale 2048.0000 (1701.8844) mem 22339MB +[2024-07-25 02:12:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][580/625] eta 0:00:26 lr 0.000864 wd 0.0500 time 0.5651 (0.5900) data time 0.0007 (0.0017) model time 0.5644 (0.5860) loss 6.8583 (7.6694) grad_norm 3.2070 (2.2096) loss_scale 2048.0000 (1707.8417) mem 22339MB +[2024-07-25 02:12:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][590/625] eta 0:00:20 lr 0.000864 wd 0.0500 time 0.5710 (0.5898) data time 0.0007 (0.0017) model time 0.5703 (0.5858) loss 6.6573 (7.6579) grad_norm 2.6738 (2.2100) loss_scale 2048.0000 (1713.5973) mem 22339MB +[2024-07-25 02:12:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][600/625] eta 0:00:14 lr 0.000864 wd 0.0500 time 0.5627 (0.5898) data time 0.0006 (0.0017) model time 0.5621 (0.5858) loss 7.0547 (7.6593) grad_norm 4.0653 (2.2094) loss_scale 2048.0000 (1719.1614) mem 22339MB +[2024-07-25 02:12:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][610/625] eta 0:00:08 lr 0.000864 wd 0.0500 time 0.5638 (0.5899) data time 0.0004 (0.0017) model time 0.5634 (0.5859) loss 8.6744 (7.6553) grad_norm 2.3449 (2.2155) loss_scale 2048.0000 (1724.5434) mem 22339MB +[2024-07-25 02:13:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [119/300][620/625] eta 0:00:02 lr 0.000864 wd 0.0500 time 0.5624 (0.5900) data time 0.0006 (0.0017) model time 0.5618 (0.5860) loss 7.8686 (7.6552) grad_norm 1.5623 (2.2160) loss_scale 2048.0000 (1729.7520) mem 22339MB +[2024-07-25 02:13:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 119 training takes 0:06:08 +[2024-07-25 02:13:02 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 02:13:04 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 02:13:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.493 (0.493) Loss 0.5312 (0.5312) Acc@1 89.355 (89.355) Acc@5 98.486 (98.486) Mem 22339MB +[2024-07-25 02:13:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.164) Loss 0.8921 (0.6773) Acc@1 79.395 (85.289) Acc@5 95.508 (97.519) Mem 22339MB +[2024-07-25 02:13:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.146) Loss 0.9526 (0.8008) Acc@1 76.611 (81.968) Acc@5 94.580 (96.110) Mem 22339MB +[2024-07-25 02:13:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.686 Acc@5 96.115 +[2024-07-25 02:13:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.7% +[2024-07-25 02:13:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.760 (0.760) Loss 0.5034 (0.5034) Acc@1 89.307 (89.307) Acc@5 98.682 (98.682) Mem 22339MB +[2024-07-25 02:13:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.184) Loss 0.8086 (0.6364) Acc@1 81.055 (86.213) Acc@5 96.143 (97.701) Mem 22339MB +[2024-07-25 02:13:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.157) Loss 0.9233 (0.7474) Acc@1 76.465 (82.906) Acc@5 95.264 (96.526) Mem 22339MB +[2024-07-25 02:13:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.578 Acc@5 96.539 +[2024-07-25 02:13:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.6% +[2024-07-25 02:13:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.58% +[2024-07-25 02:13:11 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 02:13:13 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 02:13:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][0/625] eta 0:08:51 lr 0.000864 wd 0.0500 time 0.8510 (0.8510) data time 0.3321 (0.3321) model time 0.0000 (0.0000) loss 5.7167 (5.7167) grad_norm 1.9691 (1.9691) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:13:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][10/625] eta 0:06:13 lr 0.000864 wd 0.0500 time 0.5613 (0.6073) data time 0.0008 (0.0322) model time 0.0000 (0.0000) loss 7.7231 (7.2754) grad_norm 2.6033 (2.0300) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:13:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][20/625] eta 0:05:59 lr 0.000864 wd 0.0500 time 0.5652 (0.5936) data time 0.0008 (0.0173) model time 0.0000 (0.0000) loss 9.1689 (7.6156) grad_norm 1.8057 (2.0060) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:13:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][30/625] eta 0:05:52 lr 0.000863 wd 0.0500 time 0.5665 (0.5929) data time 0.0008 (0.0120) model time 0.0000 (0.0000) loss 8.9382 (7.7750) grad_norm 1.6641 (2.0764) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:13:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][40/625] eta 0:05:52 lr 0.000863 wd 0.0500 time 0.5672 (0.6030) data time 0.0006 (0.0093) model time 0.0000 (0.0000) loss 7.6406 (7.8261) grad_norm 2.6278 (2.0632) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:13:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][50/625] eta 0:05:50 lr 0.000863 wd 0.0500 time 0.7173 (0.6099) data time 0.0006 (0.0076) model time 0.0000 (0.0000) loss 7.6195 (7.7999) grad_norm 1.6708 (2.0616) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:13:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][60/625] eta 0:05:44 lr 0.000863 wd 0.0500 time 0.5703 (0.6100) data time 0.0008 (0.0065) model time 0.5695 (0.6099) loss 7.8606 (7.7374) grad_norm 1.8794 (2.0878) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:13:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][70/625] eta 0:05:38 lr 0.000863 wd 0.0500 time 0.5656 (0.6090) data time 0.0008 (0.0057) model time 0.5648 (0.6061) loss 6.8044 (7.7234) grad_norm 1.7793 (2.0784) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:14:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][80/625] eta 0:05:29 lr 0.000863 wd 0.0500 time 0.5633 (0.6050) data time 0.0009 (0.0051) model time 0.5624 (0.5960) loss 6.7141 (7.7057) grad_norm 3.1488 (2.1468) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:14:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][90/625] eta 0:05:22 lr 0.000863 wd 0.0500 time 0.5677 (0.6019) data time 0.0008 (0.0047) model time 0.5669 (0.5909) loss 6.5752 (7.6823) grad_norm 2.5386 (2.2306) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:14:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][100/625] eta 0:05:14 lr 0.000863 wd 0.0500 time 0.5626 (0.5995) data time 0.0008 (0.0043) model time 0.5618 (0.5880) loss 7.4076 (7.6358) grad_norm 3.1344 (2.2428) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:14:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][110/625] eta 0:05:07 lr 0.000863 wd 0.0500 time 0.5633 (0.5974) data time 0.0009 (0.0040) model time 0.5624 (0.5859) loss 8.0619 (7.6382) grad_norm 1.9025 (2.2468) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:14:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][120/625] eta 0:05:00 lr 0.000863 wd 0.0500 time 0.5630 (0.5957) data time 0.0009 (0.0037) model time 0.5621 (0.5846) loss 9.2240 (7.6756) grad_norm 2.2768 (2.2220) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:14:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][130/625] eta 0:04:54 lr 0.000862 wd 0.0500 time 0.5666 (0.5942) data time 0.0008 (0.0035) model time 0.5658 (0.5834) loss 5.9263 (7.6749) grad_norm 2.1707 (2.2426) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:14:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][140/625] eta 0:04:47 lr 0.000862 wd 0.0500 time 0.5654 (0.5929) data time 0.0006 (0.0033) model time 0.5647 (0.5825) loss 6.9725 (7.6876) grad_norm 2.0271 (2.2330) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:14:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][150/625] eta 0:04:41 lr 0.000862 wd 0.0500 time 0.5609 (0.5917) data time 0.0006 (0.0031) model time 0.5603 (0.5816) loss 8.4295 (7.6915) grad_norm 3.3953 (2.2319) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:14:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][160/625] eta 0:04:34 lr 0.000862 wd 0.0500 time 0.5637 (0.5909) data time 0.0008 (0.0030) model time 0.5629 (0.5813) loss 8.8542 (7.6813) grad_norm 1.9390 (2.2492) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:14:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][170/625] eta 0:04:28 lr 0.000862 wd 0.0500 time 0.5656 (0.5901) data time 0.0006 (0.0029) model time 0.5650 (0.5809) loss 6.4875 (7.6735) grad_norm 2.1038 (2.2354) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:14:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][180/625] eta 0:04:22 lr 0.000862 wd 0.0500 time 0.5645 (0.5895) data time 0.0008 (0.0028) model time 0.5638 (0.5807) loss 7.0070 (7.6889) grad_norm 1.3907 (2.2396) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:15:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][190/625] eta 0:04:16 lr 0.000862 wd 0.0500 time 0.5655 (0.5888) data time 0.0006 (0.0027) model time 0.5648 (0.5803) loss 7.1016 (7.6788) grad_norm 1.8503 (2.2450) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:15:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][200/625] eta 0:04:10 lr 0.000862 wd 0.0500 time 0.5673 (0.5886) data time 0.0006 (0.0026) model time 0.5667 (0.5805) loss 7.4919 (7.6929) grad_norm 1.9182 (2.2371) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:15:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][210/625] eta 0:04:04 lr 0.000862 wd 0.0500 time 0.5615 (0.5881) data time 0.0007 (0.0025) model time 0.5609 (0.5803) loss 5.3291 (7.6504) grad_norm 1.7204 (2.2220) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:15:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][220/625] eta 0:03:58 lr 0.000862 wd 0.0500 time 0.5669 (0.5877) data time 0.0007 (0.0024) model time 0.5662 (0.5801) loss 7.5439 (7.6572) grad_norm 1.8141 (2.1998) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:15:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][230/625] eta 0:03:51 lr 0.000862 wd 0.0500 time 0.5718 (0.5872) data time 0.0006 (0.0023) model time 0.5713 (0.5800) loss 6.0665 (7.6625) grad_norm 1.4576 (2.1890) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:15:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][240/625] eta 0:03:45 lr 0.000861 wd 0.0500 time 0.5623 (0.5870) data time 0.0009 (0.0023) model time 0.5615 (0.5800) loss 9.0964 (7.6693) grad_norm 2.4886 (2.2001) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:15:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][250/625] eta 0:03:40 lr 0.000861 wd 0.0500 time 0.5618 (0.5880) data time 0.0008 (0.0022) model time 0.5610 (0.5816) loss 8.5242 (7.6642) grad_norm 2.1969 (2.2118) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:15:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][260/625] eta 0:03:35 lr 0.000861 wd 0.0500 time 0.5609 (0.5894) data time 0.0008 (0.0022) model time 0.5601 (0.5836) loss 7.9989 (7.6745) grad_norm 1.8389 (2.2004) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:15:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][270/625] eta 0:03:30 lr 0.000861 wd 0.0500 time 0.6950 (0.5919) data time 0.0008 (0.0022) model time 0.6942 (0.5869) loss 9.0906 (7.6827) grad_norm 1.9879 (2.1971) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:15:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][280/625] eta 0:03:24 lr 0.000861 wd 0.0500 time 0.5646 (0.5932) data time 0.0006 (0.0021) model time 0.5640 (0.5886) loss 7.7932 (7.6797) grad_norm 2.0561 (2.1954) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:16:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][290/625] eta 0:03:18 lr 0.000861 wd 0.0500 time 0.5688 (0.5939) data time 0.0009 (0.0021) model time 0.5679 (0.5896) loss 8.9781 (7.6743) grad_norm 1.9641 (2.1959) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:16:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][300/625] eta 0:03:12 lr 0.000861 wd 0.0500 time 0.5667 (0.5936) data time 0.0006 (0.0020) model time 0.5660 (0.5894) loss 8.9000 (7.6703) grad_norm 2.0970 (2.2075) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:16:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][310/625] eta 0:03:06 lr 0.000861 wd 0.0500 time 0.5641 (0.5932) data time 0.0008 (0.0020) model time 0.5633 (0.5891) loss 7.2962 (7.6729) grad_norm 2.0038 (2.2216) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:16:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][320/625] eta 0:03:00 lr 0.000861 wd 0.0500 time 0.5660 (0.5928) data time 0.0006 (0.0020) model time 0.5654 (0.5887) loss 7.9573 (7.6869) grad_norm 1.8361 (2.2162) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:16:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][330/625] eta 0:02:54 lr 0.000861 wd 0.0500 time 0.5664 (0.5923) data time 0.0006 (0.0019) model time 0.5658 (0.5882) loss 7.4681 (7.6873) grad_norm 1.8971 (2.2137) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:16:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][340/625] eta 0:02:48 lr 0.000860 wd 0.0500 time 0.5627 (0.5920) data time 0.0008 (0.0019) model time 0.5619 (0.5879) loss 6.9576 (7.6810) grad_norm 2.1174 (2.2263) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:16:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][350/625] eta 0:02:42 lr 0.000860 wd 0.0500 time 0.5685 (0.5916) data time 0.0009 (0.0019) model time 0.5676 (0.5876) loss 7.8138 (7.7000) grad_norm 1.8825 (2.2207) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:16:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][360/625] eta 0:02:36 lr 0.000860 wd 0.0500 time 0.5622 (0.5912) data time 0.0007 (0.0018) model time 0.5615 (0.5872) loss 7.8441 (7.7075) grad_norm 2.0214 (2.2134) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:16:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][370/625] eta 0:02:30 lr 0.000860 wd 0.0500 time 0.5728 (0.5909) data time 0.0008 (0.0018) model time 0.5720 (0.5869) loss 8.3361 (7.7018) grad_norm 2.5178 (2.2080) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:16:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][380/625] eta 0:02:24 lr 0.000860 wd 0.0500 time 0.5643 (0.5905) data time 0.0009 (0.0018) model time 0.5634 (0.5866) loss 5.8415 (7.6977) grad_norm 1.7641 (2.1955) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:17:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][390/625] eta 0:02:18 lr 0.000860 wd 0.0500 time 0.5628 (0.5903) data time 0.0008 (0.0018) model time 0.5620 (0.5864) loss 6.5851 (7.6764) grad_norm 1.4624 (2.1880) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:17:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][400/625] eta 0:02:12 lr 0.000860 wd 0.0500 time 0.5657 (0.5900) data time 0.0008 (0.0017) model time 0.5649 (0.5862) loss 8.0541 (7.6811) grad_norm 1.9133 (2.1827) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:17:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][410/625] eta 0:02:06 lr 0.000860 wd 0.0500 time 0.5645 (0.5897) data time 0.0007 (0.0017) model time 0.5638 (0.5860) loss 7.4584 (7.6776) grad_norm 1.8446 (2.1817) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:17:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][420/625] eta 0:02:00 lr 0.000860 wd 0.0500 time 0.5673 (0.5898) data time 0.0006 (0.0017) model time 0.5667 (0.5861) loss 8.4828 (7.6842) grad_norm 1.9930 (2.1764) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:17:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][430/625] eta 0:01:54 lr 0.000860 wd 0.0500 time 0.5709 (0.5897) data time 0.0009 (0.0017) model time 0.5700 (0.5861) loss 9.0447 (7.6865) grad_norm 1.7621 (2.1770) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:17:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][440/625] eta 0:01:49 lr 0.000859 wd 0.0500 time 0.5681 (0.5894) data time 0.0009 (0.0017) model time 0.5673 (0.5858) loss 7.6471 (7.6881) grad_norm 1.5376 (2.1785) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:17:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][450/625] eta 0:01:43 lr 0.000859 wd 0.0500 time 0.5690 (0.5891) data time 0.0007 (0.0016) model time 0.5683 (0.5856) loss 6.6832 (7.6788) grad_norm 2.0408 (2.1774) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:17:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][460/625] eta 0:01:37 lr 0.000859 wd 0.0500 time 0.5732 (0.5889) data time 0.0006 (0.0016) model time 0.5726 (0.5854) loss 8.5263 (7.6897) grad_norm 2.7636 (2.1837) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:17:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][470/625] eta 0:01:31 lr 0.000859 wd 0.0500 time 0.5683 (0.5890) data time 0.0007 (0.0016) model time 0.5676 (0.5856) loss 7.5330 (7.6881) grad_norm 2.6931 (2.1830) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:17:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][480/625] eta 0:01:25 lr 0.000859 wd 0.0500 time 0.5682 (0.5899) data time 0.0008 (0.0016) model time 0.5674 (0.5866) loss 8.5346 (7.6955) grad_norm 2.0260 (2.1895) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:18:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][490/625] eta 0:01:19 lr 0.000859 wd 0.0500 time 0.7572 (0.5913) data time 0.0009 (0.0016) model time 0.7564 (0.5882) loss 7.5071 (7.6908) grad_norm 2.7138 (2.1897) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:18:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][500/625] eta 0:01:13 lr 0.000859 wd 0.0500 time 0.7394 (0.5920) data time 0.0007 (0.0016) model time 0.7387 (0.5890) loss 6.4249 (7.6946) grad_norm 2.0811 (2.1896) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:18:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][510/625] eta 0:01:08 lr 0.000859 wd 0.0500 time 0.5645 (0.5919) data time 0.0008 (0.0016) model time 0.5637 (0.5890) loss 8.5517 (7.6883) grad_norm 1.7412 (2.1817) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:18:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][520/625] eta 0:01:02 lr 0.000859 wd 0.0500 time 0.5629 (0.5916) data time 0.0006 (0.0016) model time 0.5623 (0.5887) loss 8.2154 (7.6807) grad_norm 2.4344 (2.1805) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:18:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][530/625] eta 0:00:56 lr 0.000859 wd 0.0500 time 0.5706 (0.5913) data time 0.0006 (0.0016) model time 0.5700 (0.5884) loss 7.0604 (7.6749) grad_norm 2.3445 (2.1804) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:18:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][540/625] eta 0:00:50 lr 0.000859 wd 0.0500 time 0.5699 (0.5910) data time 0.0007 (0.0016) model time 0.5692 (0.5881) loss 8.3669 (7.6775) grad_norm 2.6803 (2.1824) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:18:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][550/625] eta 0:00:44 lr 0.000858 wd 0.0500 time 0.5679 (0.5908) data time 0.0006 (0.0016) model time 0.5673 (0.5879) loss 7.5265 (7.6689) grad_norm 2.7676 (2.1862) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:18:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][560/625] eta 0:00:38 lr 0.000858 wd 0.0500 time 0.5674 (0.5905) data time 0.0006 (0.0015) model time 0.5668 (0.5877) loss 7.5803 (7.6701) grad_norm 2.6221 (2.1913) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:18:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][570/625] eta 0:00:32 lr 0.000858 wd 0.0500 time 0.5697 (0.5903) data time 0.0008 (0.0015) model time 0.5689 (0.5874) loss 6.9765 (7.6717) grad_norm 2.1927 (2.1947) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:18:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][580/625] eta 0:00:26 lr 0.000858 wd 0.0500 time 0.5655 (0.5900) data time 0.0008 (0.0015) model time 0.5646 (0.5871) loss 5.7830 (7.6651) grad_norm 1.7800 (2.1923) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:19:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][590/625] eta 0:00:20 lr 0.000858 wd 0.0500 time 0.5713 (0.5897) data time 0.0008 (0.0015) model time 0.5706 (0.5869) loss 7.1641 (7.6662) grad_norm 1.9456 (2.1897) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:19:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][600/625] eta 0:00:14 lr 0.000858 wd 0.0500 time 0.5660 (0.5895) data time 0.0008 (0.0015) model time 0.5652 (0.5867) loss 8.6145 (7.6742) grad_norm 2.2886 (2.1859) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:19:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][610/625] eta 0:00:08 lr 0.000858 wd 0.0500 time 0.5634 (0.5893) data time 0.0006 (0.0015) model time 0.5628 (0.5865) loss 8.7546 (7.6786) grad_norm 2.0213 (2.1927) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:19:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [120/300][620/625] eta 0:00:02 lr 0.000858 wd 0.0500 time 0.5685 (0.5891) data time 0.0005 (0.0015) model time 0.5680 (0.5863) loss 7.1078 (7.6753) grad_norm 1.9007 (2.1903) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:19:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 120 training takes 0:06:08 +[2024-07-25 02:19:21 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 02:19:23 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 02:19:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.475 (0.475) Loss 0.5488 (0.5488) Acc@1 89.551 (89.551) Acc@5 98.438 (98.438) Mem 22339MB +[2024-07-25 02:19:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.158) Loss 0.8892 (0.6707) Acc@1 78.711 (85.547) Acc@5 95.410 (97.483) Mem 22339MB +[2024-07-25 02:19:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.9971 (0.7923) Acc@1 75.977 (82.203) Acc@5 94.922 (96.203) Mem 22339MB +[2024-07-25 02:19:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.902 Acc@5 96.233 +[2024-07-25 02:19:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.9% +[2024-07-25 02:19:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 81.90% +[2024-07-25 02:19:26 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 02:19:28 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 02:19:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.466 (0.466) Loss 0.5029 (0.5029) Acc@1 89.258 (89.258) Acc@5 98.682 (98.682) Mem 22339MB +[2024-07-25 02:19:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.156) Loss 0.8081 (0.6356) Acc@1 81.006 (86.208) Acc@5 96.143 (97.687) Mem 22339MB +[2024-07-25 02:19:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9229 (0.7465) Acc@1 76.611 (82.899) Acc@5 95.312 (96.515) Mem 22339MB +[2024-07-25 02:19:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.570 Acc@5 96.527 +[2024-07-25 02:19:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.6% +[2024-07-25 02:19:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][0/625] eta 0:15:08 lr 0.000858 wd 0.0500 time 1.4536 (1.4536) data time 0.6290 (0.6290) model time 0.0000 (0.0000) loss 8.8535 (8.8535) grad_norm 1.6344 (1.6344) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:19:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][10/625] eta 0:06:42 lr 0.000858 wd 0.0500 time 0.5740 (0.6548) data time 0.0006 (0.0579) model time 0.0000 (0.0000) loss 6.6115 (7.2855) grad_norm 1.9615 (2.2176) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:19:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][20/625] eta 0:06:13 lr 0.000858 wd 0.0500 time 0.5785 (0.6168) data time 0.0006 (0.0307) model time 0.0000 (0.0000) loss 7.4832 (7.1677) grad_norm 3.1795 (2.2803) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:19:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][30/625] eta 0:05:58 lr 0.000857 wd 0.0500 time 0.5783 (0.6034) data time 0.0006 (0.0210) model time 0.0000 (0.0000) loss 7.9274 (7.4311) grad_norm 1.8578 (2.3131) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:19:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][40/625] eta 0:05:48 lr 0.000857 wd 0.0500 time 0.5770 (0.5961) data time 0.0006 (0.0161) model time 0.0000 (0.0000) loss 8.0880 (7.5547) grad_norm 1.7484 (2.2898) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:20:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][50/625] eta 0:05:40 lr 0.000857 wd 0.0500 time 0.5788 (0.5922) data time 0.0008 (0.0131) model time 0.0000 (0.0000) loss 8.5030 (7.6154) grad_norm 1.9862 (2.2796) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:20:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][60/625] eta 0:05:33 lr 0.000857 wd 0.0500 time 0.5783 (0.5896) data time 0.0008 (0.0111) model time 0.5775 (0.5756) loss 8.1360 (7.5956) grad_norm 2.1561 (2.2630) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:20:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][70/625] eta 0:05:29 lr 0.000857 wd 0.0500 time 0.7209 (0.5931) data time 0.0008 (0.0096) model time 0.7201 (0.5945) loss 6.1445 (7.5826) grad_norm 2.3992 (2.3049) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:20:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][80/625] eta 0:05:24 lr 0.000857 wd 0.0500 time 0.7260 (0.5951) data time 0.0009 (0.0085) model time 0.7251 (0.5992) loss 6.6471 (7.6007) grad_norm 1.9576 (2.2674) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:20:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][90/625] eta 0:05:22 lr 0.000857 wd 0.0500 time 0.7687 (0.6035) data time 0.0007 (0.0077) model time 0.7681 (0.6171) loss 7.6444 (7.6273) grad_norm 1.6864 (2.2435) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:20:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][100/625] eta 0:05:17 lr 0.000857 wd 0.0500 time 0.5702 (0.6054) data time 0.0009 (0.0070) model time 0.5693 (0.6179) loss 8.7888 (7.6596) grad_norm 1.5795 (2.2228) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:20:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][110/625] eta 0:05:11 lr 0.000857 wd 0.0500 time 0.5895 (0.6043) data time 0.0006 (0.0065) model time 0.5889 (0.6137) loss 7.5928 (7.6670) grad_norm 2.8084 (2.2129) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:20:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][120/625] eta 0:05:03 lr 0.000857 wd 0.0500 time 0.5719 (0.6019) data time 0.0007 (0.0060) model time 0.5713 (0.6082) loss 7.5971 (7.6785) grad_norm 1.9029 (2.2278) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:20:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][130/625] eta 0:04:57 lr 0.000856 wd 0.0500 time 0.5764 (0.6000) data time 0.0006 (0.0056) model time 0.5758 (0.6041) loss 5.7170 (7.6451) grad_norm 1.9212 (2.2247) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:20:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][140/625] eta 0:04:50 lr 0.000856 wd 0.0500 time 0.5768 (0.5982) data time 0.0008 (0.0053) model time 0.5760 (0.6008) loss 8.3230 (7.6397) grad_norm 2.5716 (2.2132) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:21:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][150/625] eta 0:04:43 lr 0.000856 wd 0.0500 time 0.5749 (0.5968) data time 0.0008 (0.0050) model time 0.5741 (0.5983) loss 9.3070 (7.6630) grad_norm 2.0657 (2.2116) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:21:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][160/625] eta 0:04:36 lr 0.000856 wd 0.0500 time 0.5830 (0.5956) data time 0.0009 (0.0047) model time 0.5821 (0.5963) loss 8.1835 (7.6622) grad_norm 1.8320 (2.1953) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:21:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][170/625] eta 0:04:30 lr 0.000856 wd 0.0500 time 0.5821 (0.5944) data time 0.0006 (0.0045) model time 0.5815 (0.5945) loss 9.0556 (7.6819) grad_norm 1.7234 (2.1928) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:21:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][180/625] eta 0:04:24 lr 0.000856 wd 0.0500 time 0.5962 (0.5935) data time 0.0006 (0.0043) model time 0.5956 (0.5932) loss 6.2142 (7.6724) grad_norm 2.3100 (2.1959) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:21:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][190/625] eta 0:04:18 lr 0.000856 wd 0.0500 time 0.5884 (0.5934) data time 0.0008 (0.0041) model time 0.5876 (0.5930) loss 8.0242 (7.6703) grad_norm 1.7892 (2.1912) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:21:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][200/625] eta 0:04:11 lr 0.000856 wd 0.0500 time 0.5781 (0.5925) data time 0.0006 (0.0040) model time 0.5775 (0.5917) loss 6.4811 (7.6672) grad_norm 2.2635 (2.2133) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:21:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][210/625] eta 0:04:05 lr 0.000856 wd 0.0500 time 0.5786 (0.5917) data time 0.0008 (0.0038) model time 0.5779 (0.5907) loss 6.0900 (7.6497) grad_norm 6.7983 (2.2432) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:21:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][220/625] eta 0:03:59 lr 0.000856 wd 0.0500 time 0.5785 (0.5910) data time 0.0008 (0.0037) model time 0.5777 (0.5898) loss 7.9221 (7.6527) grad_norm 2.9118 (2.2583) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:21:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][230/625] eta 0:03:53 lr 0.000855 wd 0.0500 time 0.5772 (0.5905) data time 0.0008 (0.0036) model time 0.5764 (0.5891) loss 8.5904 (7.6536) grad_norm 3.1019 (2.2627) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:21:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][240/625] eta 0:03:47 lr 0.000855 wd 0.0500 time 0.5750 (0.5899) data time 0.0009 (0.0034) model time 0.5741 (0.5884) loss 7.4357 (7.6650) grad_norm 2.5513 (2.2689) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:21:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][250/625] eta 0:03:41 lr 0.000855 wd 0.0500 time 0.5809 (0.5894) data time 0.0007 (0.0033) model time 0.5802 (0.5878) loss 8.4199 (7.6679) grad_norm 2.6461 (2.2661) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:22:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][260/625] eta 0:03:34 lr 0.000855 wd 0.0500 time 0.5762 (0.5889) data time 0.0008 (0.0032) model time 0.5754 (0.5872) loss 6.3489 (7.6576) grad_norm 2.8462 (2.2595) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:22:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][270/625] eta 0:03:28 lr 0.000855 wd 0.0500 time 0.5770 (0.5884) data time 0.0007 (0.0032) model time 0.5762 (0.5867) loss 7.7358 (7.6561) grad_norm 2.0179 (2.2577) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:22:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][280/625] eta 0:03:23 lr 0.000855 wd 0.0500 time 0.7439 (0.5886) data time 0.0006 (0.0031) model time 0.7433 (0.5869) loss 7.1690 (7.6421) grad_norm 2.1829 (2.2465) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:22:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][290/625] eta 0:03:17 lr 0.000855 wd 0.0500 time 0.5989 (0.5882) data time 0.0006 (0.0030) model time 0.5983 (0.5866) loss 6.7806 (7.6335) grad_norm 1.8348 (2.2504) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:22:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][300/625] eta 0:03:11 lr 0.000855 wd 0.0500 time 0.7364 (0.5902) data time 0.0006 (0.0029) model time 0.7358 (0.5889) loss 8.0045 (7.6297) grad_norm 2.3455 (2.2500) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:22:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][310/625] eta 0:03:06 lr 0.000855 wd 0.0500 time 0.7633 (0.5925) data time 0.0006 (0.0028) model time 0.7627 (0.5917) loss 8.3766 (7.6238) grad_norm 3.2333 (2.2593) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:22:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][320/625] eta 0:03:01 lr 0.000855 wd 0.0500 time 0.5729 (0.5938) data time 0.0007 (0.0028) model time 0.5722 (0.5932) loss 8.5567 (7.6167) grad_norm 2.6786 (2.2605) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:22:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][330/625] eta 0:02:55 lr 0.000855 wd 0.0500 time 0.5769 (0.5933) data time 0.0008 (0.0027) model time 0.5761 (0.5926) loss 8.2389 (7.6324) grad_norm 1.8475 (2.2496) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:22:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][340/625] eta 0:02:48 lr 0.000854 wd 0.0500 time 0.5747 (0.5928) data time 0.0006 (0.0027) model time 0.5741 (0.5920) loss 6.3829 (7.6212) grad_norm 1.6476 (2.2339) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:22:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][350/625] eta 0:02:42 lr 0.000854 wd 0.0500 time 0.5886 (0.5924) data time 0.0008 (0.0026) model time 0.5878 (0.5915) loss 8.9855 (7.6339) grad_norm 2.3023 (2.2217) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:23:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][360/625] eta 0:02:36 lr 0.000854 wd 0.0500 time 0.5914 (0.5920) data time 0.0006 (0.0026) model time 0.5908 (0.5911) loss 8.3283 (7.6450) grad_norm 1.7828 (2.2133) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:23:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][370/625] eta 0:02:30 lr 0.000854 wd 0.0500 time 0.5775 (0.5916) data time 0.0006 (0.0025) model time 0.5769 (0.5906) loss 7.0498 (7.6537) grad_norm 2.0359 (2.2191) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:23:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][380/625] eta 0:02:24 lr 0.000854 wd 0.0500 time 0.5821 (0.5912) data time 0.0008 (0.0025) model time 0.5813 (0.5902) loss 9.0789 (7.6544) grad_norm 1.3950 (2.2223) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:23:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][390/625] eta 0:02:18 lr 0.000854 wd 0.0500 time 0.5797 (0.5908) data time 0.0006 (0.0024) model time 0.5791 (0.5898) loss 7.7051 (7.6602) grad_norm 1.8897 (2.2134) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:23:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][400/625] eta 0:02:12 lr 0.000854 wd 0.0500 time 0.5989 (0.5905) data time 0.0006 (0.0024) model time 0.5982 (0.5894) loss 8.1043 (7.6551) grad_norm 2.3450 (2.2077) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:23:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][410/625] eta 0:02:06 lr 0.000854 wd 0.0500 time 0.5808 (0.5903) data time 0.0007 (0.0024) model time 0.5801 (0.5892) loss 7.0180 (7.6542) grad_norm 1.7969 (2.2051) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:23:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][420/625] eta 0:02:00 lr 0.000854 wd 0.0500 time 0.5777 (0.5900) data time 0.0008 (0.0023) model time 0.5769 (0.5889) loss 7.3915 (7.6597) grad_norm 3.8336 (2.2074) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:23:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][430/625] eta 0:01:54 lr 0.000854 wd 0.0500 time 0.5795 (0.5897) data time 0.0006 (0.0023) model time 0.5789 (0.5885) loss 6.6909 (7.6600) grad_norm 3.0487 (2.2148) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:23:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][440/625] eta 0:01:49 lr 0.000853 wd 0.0500 time 0.5755 (0.5894) data time 0.0006 (0.0023) model time 0.5750 (0.5882) loss 6.2132 (7.6515) grad_norm 2.8403 (2.2115) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:23:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][450/625] eta 0:01:43 lr 0.000853 wd 0.0500 time 0.5896 (0.5892) data time 0.0008 (0.0022) model time 0.5887 (0.5879) loss 6.2360 (7.6543) grad_norm 2.0093 (2.2081) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:24:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][460/625] eta 0:01:37 lr 0.000853 wd 0.0500 time 0.5730 (0.5889) data time 0.0009 (0.0022) model time 0.5721 (0.5876) loss 6.6236 (7.6523) grad_norm 1.6377 (2.2071) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:24:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][470/625] eta 0:01:31 lr 0.000853 wd 0.0500 time 0.6052 (0.5886) data time 0.0008 (0.0022) model time 0.6045 (0.5873) loss 6.1265 (7.6548) grad_norm 1.7814 (2.1961) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:24:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][480/625] eta 0:01:25 lr 0.000853 wd 0.0500 time 0.5777 (0.5884) data time 0.0008 (0.0022) model time 0.5769 (0.5871) loss 6.8701 (7.6445) grad_norm 1.7306 (2.1916) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:24:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][490/625] eta 0:01:19 lr 0.000853 wd 0.0500 time 0.5761 (0.5881) data time 0.0006 (0.0022) model time 0.5755 (0.5868) loss 8.3936 (7.6489) grad_norm 1.5637 (2.1939) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:24:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][500/625] eta 0:01:13 lr 0.000853 wd 0.0500 time 0.5767 (0.5879) data time 0.0006 (0.0021) model time 0.5761 (0.5865) loss 6.3288 (7.6463) grad_norm 2.4968 (2.1916) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:24:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][510/625] eta 0:01:07 lr 0.000853 wd 0.0500 time 0.7375 (0.5883) data time 0.0009 (0.0021) model time 0.7366 (0.5869) loss 8.8150 (7.6531) grad_norm 1.4809 (2.1871) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:24:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][520/625] eta 0:01:01 lr 0.000853 wd 0.0500 time 0.6027 (0.5888) data time 0.0007 (0.0021) model time 0.6021 (0.5876) loss 8.3132 (7.6539) grad_norm 2.3461 (2.1869) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:24:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][530/625] eta 0:00:56 lr 0.000853 wd 0.0500 time 0.5731 (0.5906) data time 0.0008 (0.0021) model time 0.5723 (0.5895) loss 6.3474 (7.6528) grad_norm 2.1972 (2.1828) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:24:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][540/625] eta 0:00:50 lr 0.000852 wd 0.0500 time 0.6784 (0.5911) data time 0.0006 (0.0020) model time 0.6778 (0.5901) loss 9.7188 (7.6558) grad_norm 2.8565 (2.1944) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:24:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][550/625] eta 0:00:44 lr 0.000852 wd 0.0500 time 0.5771 (0.5909) data time 0.0006 (0.0020) model time 0.5765 (0.5898) loss 8.2038 (7.6601) grad_norm 3.4466 (2.1978) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:25:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][560/625] eta 0:00:38 lr 0.000852 wd 0.0500 time 0.5787 (0.5906) data time 0.0006 (0.0020) model time 0.5781 (0.5896) loss 8.5815 (7.6639) grad_norm 2.0318 (2.1994) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:25:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][570/625] eta 0:00:32 lr 0.000852 wd 0.0500 time 0.5776 (0.5904) data time 0.0008 (0.0020) model time 0.5768 (0.5893) loss 8.9042 (7.6615) grad_norm 2.7341 (2.2034) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:25:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][580/625] eta 0:00:26 lr 0.000852 wd 0.0500 time 0.5731 (0.5902) data time 0.0008 (0.0019) model time 0.5723 (0.5892) loss 9.0906 (7.6636) grad_norm 1.7332 (2.2120) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:25:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][590/625] eta 0:00:20 lr 0.000852 wd 0.0500 time 0.5898 (0.5901) data time 0.0006 (0.0019) model time 0.5892 (0.5890) loss 8.1330 (7.6638) grad_norm 1.8363 (2.2093) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:25:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][600/625] eta 0:00:14 lr 0.000852 wd 0.0500 time 0.5810 (0.5898) data time 0.0006 (0.0019) model time 0.5804 (0.5887) loss 7.9308 (7.6563) grad_norm 1.6617 (2.2054) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:25:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][610/625] eta 0:00:08 lr 0.000852 wd 0.0500 time 0.5769 (0.5897) data time 0.0004 (0.0019) model time 0.5765 (0.5885) loss 7.0882 (7.6500) grad_norm 2.1481 (2.2040) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:25:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [121/300][620/625] eta 0:00:02 lr 0.000852 wd 0.0500 time 0.5784 (0.5895) data time 0.0004 (0.0019) model time 0.5781 (0.5884) loss 7.5705 (7.6481) grad_norm 2.4665 (2.2059) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:25:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 121 training takes 0:06:08 +[2024-07-25 02:25:39 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 02:25:41 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 02:25:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.475 (0.475) Loss 0.5039 (0.5039) Acc@1 89.209 (89.209) Acc@5 98.682 (98.682) Mem 22339MB +[2024-07-25 02:25:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.8320 (0.6420) Acc@1 80.225 (85.649) Acc@5 95.898 (97.603) Mem 22339MB +[2024-07-25 02:25:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.9253 (0.7613) Acc@1 77.246 (82.345) Acc@5 94.385 (96.268) Mem 22339MB +[2024-07-25 02:25:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.032 Acc@5 96.285 +[2024-07-25 02:25:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.0% +[2024-07-25 02:25:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 82.03% +[2024-07-25 02:25:44 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 02:25:46 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 02:25:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.467 (0.467) Loss 0.5029 (0.5029) Acc@1 89.307 (89.307) Acc@5 98.682 (98.682) Mem 22339MB +[2024-07-25 02:25:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.8062 (0.6350) Acc@1 81.006 (86.204) Acc@5 96.143 (97.687) Mem 22339MB +[2024-07-25 02:25:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.9209 (0.7456) Acc@1 76.611 (82.903) Acc@5 95.508 (96.536) Mem 22339MB +[2024-07-25 02:25:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.566 Acc@5 96.547 +[2024-07-25 02:25:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.6% +[2024-07-25 02:25:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][0/625] eta 0:14:28 lr 0.000852 wd 0.0500 time 1.3898 (1.3898) data time 0.6386 (0.6386) model time 0.0000 (0.0000) loss 9.3275 (9.3275) grad_norm 3.0943 (3.0943) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:25:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][10/625] eta 0:06:39 lr 0.000852 wd 0.0500 time 0.5789 (0.6496) data time 0.0006 (0.0588) model time 0.0000 (0.0000) loss 7.5821 (7.6444) grad_norm 2.0150 (2.5449) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:26:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][20/625] eta 0:06:11 lr 0.000851 wd 0.0500 time 0.5742 (0.6145) data time 0.0009 (0.0312) model time 0.0000 (0.0000) loss 8.1571 (7.7301) grad_norm 1.6451 (2.3146) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:26:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][30/625] eta 0:05:58 lr 0.000851 wd 0.0500 time 0.5889 (0.6022) data time 0.0008 (0.0215) model time 0.0000 (0.0000) loss 8.3913 (7.6050) grad_norm 2.6206 (inf) loss_scale 1024.0000 (1849.8065) mem 22339MB +[2024-07-25 02:26:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][40/625] eta 0:05:48 lr 0.000851 wd 0.0500 time 0.5852 (0.5965) data time 0.0008 (0.0164) model time 0.0000 (0.0000) loss 8.7670 (7.6696) grad_norm 2.7497 (inf) loss_scale 1024.0000 (1648.3902) mem 22339MB +[2024-07-25 02:26:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][50/625] eta 0:05:40 lr 0.000851 wd 0.0500 time 0.5874 (0.5930) data time 0.0007 (0.0135) model time 0.0000 (0.0000) loss 8.3429 (7.6626) grad_norm 2.2679 (inf) loss_scale 1024.0000 (1525.9608) mem 22339MB +[2024-07-25 02:26:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][60/625] eta 0:05:34 lr 0.000851 wd 0.0500 time 0.5735 (0.5912) data time 0.0006 (0.0114) model time 0.5729 (0.5812) loss 6.8535 (7.6276) grad_norm 2.0555 (inf) loss_scale 1024.0000 (1443.6721) mem 22339MB +[2024-07-25 02:26:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][70/625] eta 0:05:27 lr 0.000851 wd 0.0500 time 0.5762 (0.5892) data time 0.0008 (0.0099) model time 0.5754 (0.5788) loss 8.6323 (7.6671) grad_norm 4.7140 (inf) loss_scale 1024.0000 (1384.5634) mem 22339MB +[2024-07-25 02:26:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][80/625] eta 0:05:20 lr 0.000851 wd 0.0500 time 0.5913 (0.5884) data time 0.0008 (0.0088) model time 0.5905 (0.5797) loss 8.4234 (7.6869) grad_norm 1.7824 (inf) loss_scale 1024.0000 (1340.0494) mem 22339MB +[2024-07-25 02:26:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][90/625] eta 0:05:14 lr 0.000851 wd 0.0500 time 0.5829 (0.5875) data time 0.0008 (0.0079) model time 0.5821 (0.5797) loss 8.9469 (7.6886) grad_norm 1.9951 (inf) loss_scale 1024.0000 (1305.3187) mem 22339MB +[2024-07-25 02:26:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][100/625] eta 0:05:08 lr 0.000851 wd 0.0500 time 0.5782 (0.5880) data time 0.0008 (0.0072) model time 0.5774 (0.5821) loss 7.4183 (7.7097) grad_norm 2.4692 (inf) loss_scale 1024.0000 (1277.4653) mem 22339MB +[2024-07-25 02:26:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][110/625] eta 0:05:03 lr 0.000851 wd 0.0500 time 0.6261 (0.5898) data time 0.0008 (0.0066) model time 0.6253 (0.5862) loss 7.9421 (7.6871) grad_norm 3.1291 (inf) loss_scale 1024.0000 (1254.6306) mem 22339MB +[2024-07-25 02:27:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][120/625] eta 0:05:00 lr 0.000850 wd 0.0500 time 0.7037 (0.5946) data time 0.0008 (0.0062) model time 0.7028 (0.5949) loss 7.5834 (7.6192) grad_norm 2.6399 (inf) loss_scale 1024.0000 (1235.5702) mem 22339MB +[2024-07-25 02:27:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][130/625] eta 0:04:55 lr 0.000850 wd 0.0500 time 0.7525 (0.5974) data time 0.0009 (0.0058) model time 0.7517 (0.5993) loss 8.2411 (7.6186) grad_norm 1.7391 (inf) loss_scale 1024.0000 (1219.4198) mem 22339MB +[2024-07-25 02:27:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][140/625] eta 0:04:49 lr 0.000850 wd 0.0500 time 0.5746 (0.5978) data time 0.0007 (0.0054) model time 0.5740 (0.5997) loss 8.4117 (7.6017) grad_norm 1.5654 (inf) loss_scale 1024.0000 (1205.5603) mem 22339MB +[2024-07-25 02:27:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][150/625] eta 0:04:43 lr 0.000850 wd 0.0500 time 0.5779 (0.5963) data time 0.0008 (0.0051) model time 0.5771 (0.5971) loss 7.3509 (7.5874) grad_norm 4.2679 (inf) loss_scale 1024.0000 (1193.5364) mem 22339MB +[2024-07-25 02:27:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][160/625] eta 0:04:37 lr 0.000850 wd 0.0500 time 0.5981 (0.5962) data time 0.0006 (0.0049) model time 0.5975 (0.5968) loss 7.8976 (7.6059) grad_norm 3.0429 (inf) loss_scale 1024.0000 (1183.0062) mem 22339MB +[2024-07-25 02:27:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][170/625] eta 0:04:30 lr 0.000850 wd 0.0500 time 0.5936 (0.5951) data time 0.0006 (0.0046) model time 0.5931 (0.5952) loss 9.5891 (7.5988) grad_norm 1.9548 (inf) loss_scale 1024.0000 (1173.7076) mem 22339MB +[2024-07-25 02:27:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][180/625] eta 0:04:24 lr 0.000850 wd 0.0500 time 0.5981 (0.5944) data time 0.0007 (0.0044) model time 0.5973 (0.5940) loss 7.1656 (7.5742) grad_norm 1.9863 (inf) loss_scale 1024.0000 (1165.4365) mem 22339MB +[2024-07-25 02:27:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][190/625] eta 0:04:18 lr 0.000850 wd 0.0500 time 0.6052 (0.5939) data time 0.0006 (0.0043) model time 0.6046 (0.5932) loss 6.2183 (7.5551) grad_norm 1.7450 (inf) loss_scale 1024.0000 (1158.0314) mem 22339MB +[2024-07-25 02:27:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][200/625] eta 0:04:12 lr 0.000850 wd 0.0500 time 0.5821 (0.5930) data time 0.0008 (0.0042) model time 0.5813 (0.5920) loss 8.9549 (7.5723) grad_norm 1.5323 (inf) loss_scale 1024.0000 (1151.3632) mem 22339MB +[2024-07-25 02:27:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][210/625] eta 0:04:05 lr 0.000850 wd 0.0500 time 0.5847 (0.5924) data time 0.0008 (0.0040) model time 0.5840 (0.5912) loss 6.2934 (7.5552) grad_norm 1.8727 (inf) loss_scale 1024.0000 (1145.3270) mem 22339MB +[2024-07-25 02:28:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][220/625] eta 0:03:59 lr 0.000850 wd 0.0500 time 0.5760 (0.5916) data time 0.0008 (0.0039) model time 0.5752 (0.5902) loss 7.6219 (7.5632) grad_norm 1.9464 (inf) loss_scale 1024.0000 (1139.8371) mem 22339MB +[2024-07-25 02:28:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][230/625] eta 0:03:53 lr 0.000849 wd 0.0500 time 0.5784 (0.5910) data time 0.0006 (0.0037) model time 0.5778 (0.5894) loss 7.4930 (7.5664) grad_norm 1.6607 (inf) loss_scale 1024.0000 (1134.8225) mem 22339MB +[2024-07-25 02:28:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][240/625] eta 0:03:47 lr 0.000849 wd 0.0500 time 0.5756 (0.5904) data time 0.0008 (0.0036) model time 0.5749 (0.5887) loss 8.2501 (7.5739) grad_norm 3.6455 (inf) loss_scale 1024.0000 (1130.2241) mem 22339MB +[2024-07-25 02:28:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][250/625] eta 0:03:41 lr 0.000849 wd 0.0500 time 0.5853 (0.5899) data time 0.0006 (0.0035) model time 0.5847 (0.5882) loss 6.8738 (7.5755) grad_norm 2.2154 (inf) loss_scale 1024.0000 (1125.9920) mem 22339MB +[2024-07-25 02:28:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][260/625] eta 0:03:35 lr 0.000849 wd 0.0500 time 0.5907 (0.5895) data time 0.0006 (0.0034) model time 0.5901 (0.5877) loss 7.1420 (7.5601) grad_norm 1.5711 (inf) loss_scale 1024.0000 (1122.0843) mem 22339MB +[2024-07-25 02:28:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][270/625] eta 0:03:29 lr 0.000849 wd 0.0500 time 0.5758 (0.5891) data time 0.0008 (0.0033) model time 0.5750 (0.5873) loss 6.5259 (7.5558) grad_norm 2.3202 (inf) loss_scale 1024.0000 (1118.4649) mem 22339MB +[2024-07-25 02:28:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][280/625] eta 0:03:23 lr 0.000849 wd 0.0500 time 0.5819 (0.5887) data time 0.0006 (0.0032) model time 0.5814 (0.5868) loss 8.1984 (7.5599) grad_norm 4.8011 (inf) loss_scale 1024.0000 (1115.1032) mem 22339MB +[2024-07-25 02:28:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][290/625] eta 0:03:17 lr 0.000849 wd 0.0500 time 0.5772 (0.5882) data time 0.0007 (0.0032) model time 0.5765 (0.5863) loss 7.2683 (7.5707) grad_norm 2.9182 (inf) loss_scale 1024.0000 (1111.9725) mem 22339MB +[2024-07-25 02:28:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][300/625] eta 0:03:11 lr 0.000849 wd 0.0500 time 0.5843 (0.5879) data time 0.0006 (0.0031) model time 0.5837 (0.5859) loss 6.9894 (7.5917) grad_norm 2.4700 (inf) loss_scale 1024.0000 (1109.0498) mem 22339MB +[2024-07-25 02:28:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][310/625] eta 0:03:05 lr 0.000849 wd 0.0500 time 0.5831 (0.5876) data time 0.0008 (0.0030) model time 0.5824 (0.5855) loss 8.8379 (7.5925) grad_norm 2.3710 (inf) loss_scale 1024.0000 (1106.3151) mem 22339MB +[2024-07-25 02:28:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][320/625] eta 0:02:59 lr 0.000849 wd 0.0500 time 0.5771 (0.5876) data time 0.0008 (0.0029) model time 0.5763 (0.5857) loss 8.2103 (7.5941) grad_norm 1.5725 (inf) loss_scale 1024.0000 (1103.7508) mem 22339MB +[2024-07-25 02:29:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][330/625] eta 0:02:53 lr 0.000848 wd 0.0500 time 0.7361 (0.5883) data time 0.0008 (0.0029) model time 0.7352 (0.5865) loss 7.6786 (7.5932) grad_norm 2.1123 (inf) loss_scale 1024.0000 (1101.3414) mem 22339MB +[2024-07-25 02:29:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][340/625] eta 0:02:48 lr 0.000848 wd 0.0500 time 0.6986 (0.5896) data time 0.0006 (0.0028) model time 0.6980 (0.5881) loss 8.6880 (7.6029) grad_norm 1.5922 (inf) loss_scale 1024.0000 (1099.0733) mem 22339MB +[2024-07-25 02:29:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][350/625] eta 0:02:42 lr 0.000848 wd 0.0500 time 0.5820 (0.5905) data time 0.0006 (0.0028) model time 0.5814 (0.5891) loss 5.7909 (7.5848) grad_norm 1.6215 (inf) loss_scale 1024.0000 (1096.9345) mem 22339MB +[2024-07-25 02:29:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][360/625] eta 0:02:36 lr 0.000848 wd 0.0500 time 0.5842 (0.5915) data time 0.0006 (0.0027) model time 0.5836 (0.5903) loss 7.1584 (7.5696) grad_norm 1.9550 (inf) loss_scale 1024.0000 (1094.9141) mem 22339MB +[2024-07-25 02:29:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][370/625] eta 0:02:30 lr 0.000848 wd 0.0500 time 0.6996 (0.5914) data time 0.0008 (0.0027) model time 0.6989 (0.5903) loss 9.0349 (7.5863) grad_norm 2.6761 (inf) loss_scale 1024.0000 (1093.0027) mem 22339MB +[2024-07-25 02:29:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][380/625] eta 0:02:24 lr 0.000848 wd 0.0500 time 0.5760 (0.5909) data time 0.0008 (0.0026) model time 0.5752 (0.5896) loss 8.7186 (7.5799) grad_norm 1.9381 (inf) loss_scale 1024.0000 (1091.1916) mem 22339MB +[2024-07-25 02:29:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][390/625] eta 0:02:18 lr 0.000848 wd 0.0500 time 0.5747 (0.5905) data time 0.0008 (0.0026) model time 0.5740 (0.5892) loss 8.9364 (7.5846) grad_norm 2.0329 (inf) loss_scale 1024.0000 (1089.4731) mem 22339MB +[2024-07-25 02:29:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][400/625] eta 0:02:12 lr 0.000848 wd 0.0500 time 0.5792 (0.5901) data time 0.0008 (0.0025) model time 0.5784 (0.5888) loss 6.9428 (7.5828) grad_norm 2.2447 (inf) loss_scale 1024.0000 (1087.8404) mem 22339MB +[2024-07-25 02:29:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][410/625] eta 0:02:06 lr 0.000848 wd 0.0500 time 0.5867 (0.5899) data time 0.0007 (0.0025) model time 0.5860 (0.5885) loss 8.2993 (7.5702) grad_norm 2.0254 (inf) loss_scale 1024.0000 (1086.2871) mem 22339MB +[2024-07-25 02:29:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][420/625] eta 0:02:00 lr 0.000848 wd 0.0500 time 0.6042 (0.5896) data time 0.0010 (0.0024) model time 0.6032 (0.5882) loss 8.6759 (7.5791) grad_norm 2.0347 (inf) loss_scale 1024.0000 (1084.8076) mem 22339MB +[2024-07-25 02:30:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][430/625] eta 0:01:54 lr 0.000847 wd 0.0500 time 0.5868 (0.5894) data time 0.0008 (0.0024) model time 0.5861 (0.5880) loss 5.8899 (7.5836) grad_norm 1.8195 (inf) loss_scale 1024.0000 (1083.3968) mem 22339MB +[2024-07-25 02:30:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][440/625] eta 0:01:49 lr 0.000847 wd 0.0500 time 0.5963 (0.5893) data time 0.0008 (0.0024) model time 0.5955 (0.5879) loss 6.8551 (7.5778) grad_norm 2.4439 (inf) loss_scale 1024.0000 (1082.0499) mem 22339MB +[2024-07-25 02:30:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][450/625] eta 0:01:43 lr 0.000847 wd 0.0500 time 0.5871 (0.5891) data time 0.0008 (0.0024) model time 0.5863 (0.5877) loss 7.8535 (7.5725) grad_norm 2.8443 (inf) loss_scale 1024.0000 (1080.7627) mem 22339MB +[2024-07-25 02:30:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][460/625] eta 0:01:37 lr 0.000847 wd 0.0500 time 0.5869 (0.5889) data time 0.0008 (0.0023) model time 0.5861 (0.5874) loss 8.8010 (7.5788) grad_norm 2.6022 (inf) loss_scale 1024.0000 (1079.5315) mem 22339MB +[2024-07-25 02:30:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][470/625] eta 0:01:31 lr 0.000847 wd 0.0500 time 0.5755 (0.5886) data time 0.0008 (0.0023) model time 0.5747 (0.5871) loss 7.4524 (7.5767) grad_norm 1.7430 (inf) loss_scale 1024.0000 (1078.3524) mem 22339MB +[2024-07-25 02:30:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][480/625] eta 0:01:25 lr 0.000847 wd 0.0500 time 0.5825 (0.5883) data time 0.0008 (0.0023) model time 0.5817 (0.5868) loss 8.6946 (7.5866) grad_norm 1.8720 (inf) loss_scale 1024.0000 (1077.2225) mem 22339MB +[2024-07-25 02:30:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][490/625] eta 0:01:19 lr 0.000847 wd 0.0500 time 0.5751 (0.5881) data time 0.0006 (0.0022) model time 0.5744 (0.5866) loss 7.9433 (7.5868) grad_norm 2.4810 (inf) loss_scale 1024.0000 (1076.1385) mem 22339MB +[2024-07-25 02:30:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][500/625] eta 0:01:13 lr 0.000847 wd 0.0500 time 0.6099 (0.5879) data time 0.0008 (0.0022) model time 0.6091 (0.5864) loss 5.7528 (7.5801) grad_norm 2.0332 (inf) loss_scale 1024.0000 (1075.0978) mem 22339MB +[2024-07-25 02:30:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][510/625] eta 0:01:07 lr 0.000847 wd 0.0500 time 0.5772 (0.5876) data time 0.0006 (0.0022) model time 0.5766 (0.5861) loss 7.1973 (7.5836) grad_norm 2.8515 (inf) loss_scale 1024.0000 (1074.0978) mem 22339MB +[2024-07-25 02:30:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][520/625] eta 0:01:01 lr 0.000847 wd 0.0500 time 0.5789 (0.5874) data time 0.0008 (0.0022) model time 0.5781 (0.5859) loss 9.0834 (7.5874) grad_norm 2.7900 (inf) loss_scale 1024.0000 (1073.1363) mem 22339MB +[2024-07-25 02:31:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][530/625] eta 0:00:55 lr 0.000846 wd 0.0500 time 0.5830 (0.5872) data time 0.0008 (0.0021) model time 0.5823 (0.5857) loss 7.7258 (7.5842) grad_norm 2.7169 (inf) loss_scale 1024.0000 (1072.2109) mem 22339MB +[2024-07-25 02:31:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][540/625] eta 0:00:49 lr 0.000846 wd 0.0500 time 0.5764 (0.5872) data time 0.0008 (0.0021) model time 0.5756 (0.5857) loss 8.9519 (7.5942) grad_norm 2.7710 (inf) loss_scale 1024.0000 (1071.3198) mem 22339MB +[2024-07-25 02:31:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][550/625] eta 0:00:44 lr 0.000846 wd 0.0500 time 0.7407 (0.5874) data time 0.0006 (0.0021) model time 0.7400 (0.5859) loss 7.3424 (7.5926) grad_norm 2.7949 (inf) loss_scale 1024.0000 (1070.4610) mem 22339MB +[2024-07-25 02:31:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][560/625] eta 0:00:38 lr 0.000846 wd 0.0500 time 0.7518 (0.5888) data time 0.0006 (0.0021) model time 0.7512 (0.5874) loss 8.5348 (7.5845) grad_norm 3.0306 (inf) loss_scale 1024.0000 (1069.6328) mem 22339MB +[2024-07-25 02:31:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][570/625] eta 0:00:32 lr 0.000846 wd 0.0500 time 0.7055 (0.5893) data time 0.0006 (0.0020) model time 0.7049 (0.5880) loss 7.6983 (7.5927) grad_norm 2.3479 (inf) loss_scale 1024.0000 (1068.8336) mem 22339MB +[2024-07-25 02:31:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][580/625] eta 0:00:26 lr 0.000846 wd 0.0500 time 0.5820 (0.5899) data time 0.0007 (0.0020) model time 0.5813 (0.5887) loss 7.9449 (7.5912) grad_norm 2.2740 (inf) loss_scale 1024.0000 (1068.0620) mem 22339MB +[2024-07-25 02:31:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][590/625] eta 0:00:20 lr 0.000846 wd 0.0500 time 0.5734 (0.5897) data time 0.0008 (0.0020) model time 0.5726 (0.5884) loss 7.6179 (7.5946) grad_norm 2.7015 (inf) loss_scale 1024.0000 (1067.3164) mem 22339MB +[2024-07-25 02:31:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][600/625] eta 0:00:14 lr 0.000846 wd 0.0500 time 0.5752 (0.5896) data time 0.0006 (0.0020) model time 0.5746 (0.5884) loss 7.6266 (7.6034) grad_norm 1.9128 (inf) loss_scale 1024.0000 (1066.5957) mem 22339MB +[2024-07-25 02:31:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][610/625] eta 0:00:08 lr 0.000846 wd 0.0500 time 0.5721 (0.5894) data time 0.0004 (0.0020) model time 0.5716 (0.5882) loss 7.4384 (7.6061) grad_norm 2.9226 (inf) loss_scale 1024.0000 (1065.8985) mem 22339MB +[2024-07-25 02:31:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [122/300][620/625] eta 0:00:02 lr 0.000846 wd 0.0500 time 0.5804 (0.5892) data time 0.0004 (0.0019) model time 0.5800 (0.5879) loss 6.2597 (7.6163) grad_norm 1.9064 (inf) loss_scale 1024.0000 (1065.2238) mem 22339MB +[2024-07-25 02:31:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 122 training takes 0:06:08 +[2024-07-25 02:31:57 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 02:31:59 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 02:31:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.469 (0.469) Loss 0.5161 (0.5161) Acc@1 89.404 (89.404) Acc@5 98.486 (98.486) Mem 22339MB +[2024-07-25 02:32:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.8706 (0.6551) Acc@1 78.809 (85.307) Acc@5 95.557 (97.474) Mem 22339MB +[2024-07-25 02:32:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.9512 (0.7749) Acc@1 76.709 (82.085) Acc@5 94.141 (96.154) Mem 22339MB +[2024-07-25 02:32:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.742 Acc@5 96.157 +[2024-07-25 02:32:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.7% +[2024-07-25 02:32:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.935 (0.935) Loss 0.5029 (0.5029) Acc@1 89.404 (89.404) Acc@5 98.682 (98.682) Mem 22339MB +[2024-07-25 02:32:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.199) Loss 0.8057 (0.6346) Acc@1 81.055 (86.226) Acc@5 96.191 (97.687) Mem 22339MB +[2024-07-25 02:32:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.164) Loss 0.9204 (0.7449) Acc@1 76.660 (82.922) Acc@5 95.605 (96.547) Mem 22339MB +[2024-07-25 02:32:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.588 Acc@5 96.559 +[2024-07-25 02:32:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.6% +[2024-07-25 02:32:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.59% +[2024-07-25 02:32:06 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 02:32:08 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 02:32:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][0/625] eta 0:08:58 lr 0.000846 wd 0.0500 time 0.8622 (0.8622) data time 0.3447 (0.3447) model time 0.0000 (0.0000) loss 8.3517 (8.3517) grad_norm 3.2214 (3.2214) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:32:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][10/625] eta 0:06:11 lr 0.000845 wd 0.0500 time 0.5716 (0.6033) data time 0.0008 (0.0321) model time 0.0000 (0.0000) loss 7.8849 (7.5434) grad_norm 2.4176 (2.2133) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:32:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][20/625] eta 0:05:57 lr 0.000845 wd 0.0500 time 0.5728 (0.5905) data time 0.0006 (0.0172) model time 0.0000 (0.0000) loss 7.9671 (7.4867) grad_norm 1.6170 (2.1225) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:32:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][30/625] eta 0:05:48 lr 0.000845 wd 0.0500 time 0.5723 (0.5863) data time 0.0006 (0.0119) model time 0.0000 (0.0000) loss 6.3613 (7.6685) grad_norm 2.4429 (2.1921) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:32:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][40/625] eta 0:05:41 lr 0.000845 wd 0.0500 time 0.5708 (0.5838) data time 0.0006 (0.0092) model time 0.0000 (0.0000) loss 7.4667 (7.6756) grad_norm 2.9669 (2.2260) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:32:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][50/625] eta 0:05:34 lr 0.000845 wd 0.0500 time 0.5723 (0.5824) data time 0.0006 (0.0076) model time 0.0000 (0.0000) loss 6.8927 (7.6303) grad_norm 2.0378 (2.2237) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:32:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][60/625] eta 0:05:28 lr 0.000845 wd 0.0500 time 0.5643 (0.5818) data time 0.0008 (0.0065) model time 0.5635 (0.5781) loss 7.5458 (7.5867) grad_norm 2.5007 (2.2849) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:32:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][70/625] eta 0:05:22 lr 0.000845 wd 0.0500 time 0.5734 (0.5809) data time 0.0008 (0.0057) model time 0.5727 (0.5763) loss 7.1663 (7.5871) grad_norm 1.7754 (2.2326) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:32:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][80/625] eta 0:05:16 lr 0.000845 wd 0.0500 time 0.5606 (0.5808) data time 0.0008 (0.0051) model time 0.5597 (0.5772) loss 7.7370 (7.5787) grad_norm 3.5354 (2.2345) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:33:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][90/625] eta 0:05:10 lr 0.000845 wd 0.0500 time 0.5686 (0.5801) data time 0.0008 (0.0046) model time 0.5678 (0.5764) loss 8.7903 (7.5831) grad_norm 1.8349 (2.2178) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:33:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][100/625] eta 0:05:04 lr 0.000845 wd 0.0500 time 0.5699 (0.5800) data time 0.0008 (0.0042) model time 0.5691 (0.5768) loss 8.9475 (7.6096) grad_norm 1.6557 (2.1936) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:33:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][110/625] eta 0:04:58 lr 0.000844 wd 0.0500 time 0.5613 (0.5803) data time 0.0006 (0.0039) model time 0.5606 (0.5777) loss 6.0481 (7.5757) grad_norm 2.8370 (2.1785) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:33:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][120/625] eta 0:04:52 lr 0.000844 wd 0.0500 time 0.5647 (0.5800) data time 0.0008 (0.0037) model time 0.5639 (0.5774) loss 8.3332 (7.5913) grad_norm 1.7502 (2.1783) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:33:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][130/625] eta 0:04:47 lr 0.000844 wd 0.0500 time 0.5679 (0.5813) data time 0.0006 (0.0037) model time 0.5673 (0.5793) loss 8.1029 (7.5925) grad_norm 1.7239 (2.1730) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:33:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][140/625] eta 0:04:42 lr 0.000844 wd 0.0500 time 0.5731 (0.5822) data time 0.0009 (0.0035) model time 0.5722 (0.5809) loss 8.1218 (7.6135) grad_norm 2.2590 (2.1643) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:33:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][150/625] eta 0:04:38 lr 0.000844 wd 0.0500 time 0.7094 (0.5860) data time 0.0008 (0.0033) model time 0.7086 (0.5867) loss 8.1419 (7.6016) grad_norm 2.8151 (2.1513) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:33:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][160/625] eta 0:04:34 lr 0.000844 wd 0.0500 time 0.7361 (0.5899) data time 0.0006 (0.0032) model time 0.7355 (0.5923) loss 7.9068 (7.6280) grad_norm 2.2356 (2.1608) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:33:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][170/625] eta 0:04:29 lr 0.000844 wd 0.0500 time 0.5688 (0.5924) data time 0.0009 (0.0030) model time 0.5680 (0.5955) loss 6.5846 (7.6212) grad_norm 1.9752 (2.1789) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:33:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][180/625] eta 0:04:23 lr 0.000844 wd 0.0500 time 0.5726 (0.5925) data time 0.0006 (0.0029) model time 0.5720 (0.5954) loss 8.8684 (7.6197) grad_norm 2.5466 (2.1988) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:34:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][190/625] eta 0:04:17 lr 0.000844 wd 0.0500 time 0.5677 (0.5916) data time 0.0009 (0.0028) model time 0.5667 (0.5940) loss 7.1885 (7.6404) grad_norm 1.9354 (2.2037) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:34:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][200/625] eta 0:04:11 lr 0.000844 wd 0.0500 time 0.5682 (0.5910) data time 0.0008 (0.0027) model time 0.5674 (0.5928) loss 8.8779 (7.6544) grad_norm 3.3157 (2.2167) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:34:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][210/625] eta 0:04:04 lr 0.000844 wd 0.0500 time 0.5703 (0.5901) data time 0.0007 (0.0026) model time 0.5695 (0.5916) loss 8.7385 (7.6531) grad_norm 2.2939 (2.2258) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:34:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][220/625] eta 0:03:58 lr 0.000843 wd 0.0500 time 0.5726 (0.5895) data time 0.0006 (0.0025) model time 0.5719 (0.5906) loss 7.8363 (7.6655) grad_norm 2.2241 (2.2293) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:34:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][230/625] eta 0:03:52 lr 0.000843 wd 0.0500 time 0.5630 (0.5889) data time 0.0006 (0.0025) model time 0.5624 (0.5897) loss 8.4829 (7.6552) grad_norm 3.2980 (2.2398) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:34:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][240/625] eta 0:03:46 lr 0.000843 wd 0.0500 time 0.5689 (0.5884) data time 0.0006 (0.0024) model time 0.5683 (0.5890) loss 7.5724 (7.6413) grad_norm 3.7186 (2.2596) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:34:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][250/625] eta 0:03:40 lr 0.000843 wd 0.0500 time 0.5686 (0.5879) data time 0.0007 (0.0023) model time 0.5679 (0.5883) loss 8.6354 (7.6514) grad_norm 3.0067 (2.2532) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:34:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][260/625] eta 0:03:34 lr 0.000843 wd 0.0500 time 0.5742 (0.5874) data time 0.0008 (0.0023) model time 0.5734 (0.5876) loss 7.3285 (7.6365) grad_norm 3.0643 (2.2476) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:34:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][270/625] eta 0:03:28 lr 0.000843 wd 0.0500 time 0.5714 (0.5870) data time 0.0006 (0.0022) model time 0.5708 (0.5871) loss 5.9153 (7.6325) grad_norm 1.9956 (2.2487) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:34:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][280/625] eta 0:03:22 lr 0.000843 wd 0.0500 time 0.5720 (0.5865) data time 0.0008 (0.0022) model time 0.5713 (0.5864) loss 7.6039 (7.6436) grad_norm 2.2189 (2.2412) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:34:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][290/625] eta 0:03:16 lr 0.000843 wd 0.0500 time 0.5694 (0.5861) data time 0.0008 (0.0021) model time 0.5686 (0.5860) loss 8.0090 (7.6370) grad_norm 1.8697 (2.2349) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:35:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][300/625] eta 0:03:10 lr 0.000843 wd 0.0500 time 0.5687 (0.5858) data time 0.0007 (0.0021) model time 0.5681 (0.5855) loss 7.3662 (7.6389) grad_norm 2.5643 (2.2435) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:35:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][310/625] eta 0:03:04 lr 0.000843 wd 0.0500 time 0.5722 (0.5855) data time 0.0006 (0.0021) model time 0.5716 (0.5851) loss 8.1729 (7.6267) grad_norm 3.0273 (2.2510) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:35:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][320/625] eta 0:02:58 lr 0.000842 wd 0.0500 time 0.5694 (0.5852) data time 0.0006 (0.0020) model time 0.5687 (0.5848) loss 6.9402 (7.6145) grad_norm 2.3194 (2.2531) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:35:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][330/625] eta 0:02:52 lr 0.000842 wd 0.0500 time 0.5752 (0.5849) data time 0.0007 (0.0020) model time 0.5744 (0.5844) loss 7.0810 (7.6289) grad_norm 1.6498 (2.2489) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:35:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][340/625] eta 0:02:46 lr 0.000842 wd 0.0500 time 0.5717 (0.5847) data time 0.0008 (0.0019) model time 0.5710 (0.5841) loss 6.9625 (7.6184) grad_norm 1.7604 (2.2523) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:35:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][350/625] eta 0:02:40 lr 0.000842 wd 0.0500 time 0.5825 (0.5848) data time 0.0009 (0.0019) model time 0.5816 (0.5842) loss 6.6466 (7.6108) grad_norm 1.7861 (2.2428) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:35:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][360/625] eta 0:02:35 lr 0.000842 wd 0.0500 time 0.5713 (0.5852) data time 0.0008 (0.0019) model time 0.5705 (0.5847) loss 6.3775 (7.6079) grad_norm 4.1995 (2.2446) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:35:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][370/625] eta 0:02:29 lr 0.000842 wd 0.0500 time 0.6906 (0.5866) data time 0.0006 (0.0019) model time 0.6900 (0.5863) loss 7.1173 (7.6080) grad_norm 2.4856 (2.2404) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:35:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][380/625] eta 0:02:24 lr 0.000842 wd 0.0500 time 0.7305 (0.5879) data time 0.0007 (0.0018) model time 0.7298 (0.5878) loss 6.7234 (7.6107) grad_norm 2.3416 (2.2356) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:35:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][390/625] eta 0:02:18 lr 0.000842 wd 0.0500 time 0.5731 (0.5883) data time 0.0006 (0.0018) model time 0.5725 (0.5882) loss 6.1610 (7.6081) grad_norm 2.0266 (2.2292) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:36:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][400/625] eta 0:02:12 lr 0.000842 wd 0.0500 time 0.5741 (0.5883) data time 0.0006 (0.0018) model time 0.5734 (0.5883) loss 6.8916 (7.5988) grad_norm 1.5814 (2.2249) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:36:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][410/625] eta 0:02:06 lr 0.000842 wd 0.0500 time 0.5699 (0.5880) data time 0.0007 (0.0017) model time 0.5692 (0.5879) loss 7.2557 (7.6030) grad_norm 4.4142 (2.2316) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:36:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][420/625] eta 0:02:00 lr 0.000841 wd 0.0500 time 0.5627 (0.5877) data time 0.0006 (0.0017) model time 0.5621 (0.5876) loss 8.1738 (7.6045) grad_norm 1.9710 (2.2410) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:36:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][430/625] eta 0:01:54 lr 0.000841 wd 0.0500 time 0.5706 (0.5874) data time 0.0007 (0.0017) model time 0.5699 (0.5872) loss 6.2338 (7.5950) grad_norm 2.3785 (2.2481) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:36:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][440/625] eta 0:01:48 lr 0.000841 wd 0.0500 time 0.5719 (0.5871) data time 0.0006 (0.0017) model time 0.5713 (0.5869) loss 6.4286 (7.6018) grad_norm 2.0736 (2.2423) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:36:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][450/625] eta 0:01:42 lr 0.000841 wd 0.0500 time 0.5644 (0.5869) data time 0.0006 (0.0017) model time 0.5637 (0.5865) loss 6.6002 (7.5917) grad_norm 1.8570 (2.2362) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:36:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][460/625] eta 0:01:36 lr 0.000841 wd 0.0500 time 0.5724 (0.5867) data time 0.0008 (0.0017) model time 0.5717 (0.5863) loss 7.0858 (7.5876) grad_norm 2.2517 (2.2349) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:36:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][470/625] eta 0:01:30 lr 0.000841 wd 0.0500 time 0.5763 (0.5865) data time 0.0006 (0.0016) model time 0.5757 (0.5860) loss 6.1867 (7.5867) grad_norm 1.8308 (2.2340) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:36:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][480/625] eta 0:01:25 lr 0.000841 wd 0.0500 time 0.5659 (0.5862) data time 0.0006 (0.0016) model time 0.5653 (0.5857) loss 6.9321 (7.5870) grad_norm 3.1106 (2.2315) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:36:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][490/625] eta 0:01:19 lr 0.000841 wd 0.0500 time 0.5728 (0.5860) data time 0.0006 (0.0016) model time 0.5722 (0.5855) loss 6.5427 (7.5805) grad_norm 2.0575 (2.2281) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:37:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][500/625] eta 0:01:13 lr 0.000841 wd 0.0500 time 0.5653 (0.5858) data time 0.0006 (0.0016) model time 0.5648 (0.5853) loss 9.4973 (7.5774) grad_norm 2.9189 (2.2295) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:37:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][510/625] eta 0:01:07 lr 0.000841 wd 0.0500 time 0.5673 (0.5856) data time 0.0008 (0.0016) model time 0.5664 (0.5851) loss 7.7539 (7.5785) grad_norm 3.2444 (2.2318) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:37:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][520/625] eta 0:01:01 lr 0.000840 wd 0.0500 time 0.5740 (0.5855) data time 0.0006 (0.0016) model time 0.5734 (0.5849) loss 8.4803 (7.5823) grad_norm 2.2780 (2.2290) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:37:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][530/625] eta 0:00:55 lr 0.000840 wd 0.0500 time 0.5685 (0.5852) data time 0.0006 (0.0015) model time 0.5679 (0.5846) loss 6.3101 (7.5909) grad_norm 1.8395 (2.2233) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:37:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][540/625] eta 0:00:49 lr 0.000840 wd 0.0500 time 0.5668 (0.5851) data time 0.0008 (0.0015) model time 0.5660 (0.5845) loss 6.5719 (7.5908) grad_norm 1.8563 (2.2199) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:37:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][550/625] eta 0:00:43 lr 0.000840 wd 0.0500 time 0.5755 (0.5850) data time 0.0007 (0.0015) model time 0.5748 (0.5843) loss 7.1165 (7.5901) grad_norm 2.2947 (2.2176) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:37:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][560/625] eta 0:00:38 lr 0.000840 wd 0.0500 time 0.5692 (0.5849) data time 0.0008 (0.0015) model time 0.5684 (0.5842) loss 8.2698 (7.5935) grad_norm 2.7483 (2.2210) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:37:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][570/625] eta 0:00:32 lr 0.000840 wd 0.0500 time 0.5735 (0.5850) data time 0.0006 (0.0015) model time 0.5729 (0.5843) loss 8.1658 (7.5961) grad_norm 2.1213 (2.2257) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:37:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][580/625] eta 0:00:26 lr 0.000840 wd 0.0500 time 0.5743 (0.5853) data time 0.0008 (0.0015) model time 0.5735 (0.5847) loss 6.8169 (7.5943) grad_norm 1.5151 (2.2223) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:37:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][590/625] eta 0:00:20 lr 0.000840 wd 0.0500 time 0.5596 (0.5857) data time 0.0006 (0.0015) model time 0.5589 (0.5852) loss 8.0125 (7.5980) grad_norm 1.8066 (2.2178) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:38:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][600/625] eta 0:00:14 lr 0.000840 wd 0.0500 time 0.6958 (0.5871) data time 0.0008 (0.0015) model time 0.6950 (0.5866) loss 8.8956 (7.5977) grad_norm 1.7225 (2.2150) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:38:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][610/625] eta 0:00:08 lr 0.000840 wd 0.0500 time 0.7176 (0.5876) data time 0.0006 (0.0015) model time 0.7169 (0.5872) loss 7.9708 (7.5930) grad_norm 2.1106 (2.2125) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:38:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [123/300][620/625] eta 0:00:02 lr 0.000840 wd 0.0500 time 0.5712 (0.5877) data time 0.0006 (0.0015) model time 0.5706 (0.5873) loss 8.2614 (7.5945) grad_norm 1.9615 (2.2098) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:38:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 123 training takes 0:06:07 +[2024-07-25 02:38:15 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 02:38:17 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 02:38:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.476 (0.476) Loss 0.5444 (0.5444) Acc@1 88.867 (88.867) Acc@5 98.193 (98.193) Mem 22339MB +[2024-07-25 02:38:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.8955 (0.6678) Acc@1 78.174 (85.436) Acc@5 95.605 (97.492) Mem 22339MB +[2024-07-25 02:38:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.9849 (0.7851) Acc@1 75.781 (82.141) Acc@5 94.189 (96.170) Mem 22339MB +[2024-07-25 02:38:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.754 Acc@5 96.133 +[2024-07-25 02:38:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.8% +[2024-07-25 02:38:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.867 (0.867) Loss 0.5029 (0.5029) Acc@1 89.307 (89.307) Acc@5 98.682 (98.682) Mem 22339MB +[2024-07-25 02:38:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.193) Loss 0.8042 (0.6341) Acc@1 80.908 (86.208) Acc@5 96.191 (97.705) Mem 22339MB +[2024-07-25 02:38:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.161) Loss 0.9204 (0.7443) Acc@1 76.807 (82.896) Acc@5 95.605 (96.570) Mem 22339MB +[2024-07-25 02:38:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.568 Acc@5 96.581 +[2024-07-25 02:38:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.6% +[2024-07-25 02:38:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][0/625] eta 0:14:14 lr 0.000839 wd 0.0500 time 1.3679 (1.3679) data time 0.6702 (0.6702) model time 0.0000 (0.0000) loss 9.6371 (9.6371) grad_norm 2.5172 (2.5172) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:38:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][10/625] eta 0:06:38 lr 0.000839 wd 0.0500 time 0.5754 (0.6484) data time 0.0006 (0.0617) model time 0.0000 (0.0000) loss 9.2517 (8.0503) grad_norm 2.6346 (2.1535) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:38:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][20/625] eta 0:06:11 lr 0.000839 wd 0.0500 time 0.5740 (0.6135) data time 0.0008 (0.0327) model time 0.0000 (0.0000) loss 8.1113 (7.7235) grad_norm 1.6779 (2.0963) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:38:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][30/625] eta 0:05:58 lr 0.000839 wd 0.0500 time 0.5918 (0.6028) data time 0.0008 (0.0225) model time 0.0000 (0.0000) loss 8.4574 (7.8830) grad_norm 2.0849 (2.3343) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:38:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][40/625] eta 0:05:49 lr 0.000839 wd 0.0500 time 0.5906 (0.5973) data time 0.0008 (0.0174) model time 0.0000 (0.0000) loss 8.1572 (7.6899) grad_norm 2.2481 (2.2815) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:38:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][50/625] eta 0:05:41 lr 0.000839 wd 0.0500 time 0.5957 (0.5946) data time 0.0006 (0.0143) model time 0.0000 (0.0000) loss 7.8805 (7.6074) grad_norm 1.7849 (2.2026) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:39:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][60/625] eta 0:05:34 lr 0.000839 wd 0.0500 time 0.5771 (0.5920) data time 0.0008 (0.0121) model time 0.5763 (0.5779) loss 7.8660 (7.5060) grad_norm 2.3880 (2.2089) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:39:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][70/625] eta 0:05:27 lr 0.000839 wd 0.0500 time 0.5760 (0.5897) data time 0.0006 (0.0105) model time 0.5754 (0.5761) loss 6.1847 (7.4690) grad_norm 3.2548 (2.2619) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:39:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][80/625] eta 0:05:20 lr 0.000839 wd 0.0500 time 0.5741 (0.5879) data time 0.0008 (0.0096) model time 0.5733 (0.5748) loss 7.9738 (7.4949) grad_norm 1.8653 (2.2533) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:39:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][90/625] eta 0:05:13 lr 0.000839 wd 0.0500 time 0.5725 (0.5863) data time 0.0008 (0.0086) model time 0.5717 (0.5743) loss 8.1936 (7.5402) grad_norm 2.6411 (2.4152) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:39:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][100/625] eta 0:05:07 lr 0.000838 wd 0.0500 time 0.5710 (0.5852) data time 0.0006 (0.0078) model time 0.5704 (0.5744) loss 7.8127 (7.5377) grad_norm 2.7551 (2.3920) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:39:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][110/625] eta 0:05:00 lr 0.000838 wd 0.0500 time 0.5729 (0.5843) data time 0.0006 (0.0072) model time 0.5723 (0.5744) loss 7.8673 (7.5643) grad_norm 1.7565 (2.4161) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:39:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][120/625] eta 0:04:54 lr 0.000838 wd 0.0500 time 0.5740 (0.5836) data time 0.0006 (0.0067) model time 0.5733 (0.5744) loss 6.5912 (7.5971) grad_norm 2.3634 (2.4072) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:39:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][130/625] eta 0:04:49 lr 0.000838 wd 0.0500 time 0.5754 (0.5845) data time 0.0007 (0.0062) model time 0.5746 (0.5770) loss 7.6338 (7.5752) grad_norm 1.5577 (2.3732) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:39:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][140/625] eta 0:04:43 lr 0.000838 wd 0.0500 time 0.5746 (0.5839) data time 0.0006 (0.0058) model time 0.5740 (0.5768) loss 8.6696 (7.5838) grad_norm 1.7340 (2.3448) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:39:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][150/625] eta 0:04:37 lr 0.000838 wd 0.0500 time 0.5894 (0.5834) data time 0.0008 (0.0055) model time 0.5886 (0.5767) loss 7.1414 (7.6094) grad_norm 2.3047 (2.3398) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:39:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][160/625] eta 0:04:31 lr 0.000838 wd 0.0500 time 0.5760 (0.5829) data time 0.0008 (0.0052) model time 0.5753 (0.5765) loss 8.1995 (7.5790) grad_norm 2.6147 (2.3486) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:40:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][170/625] eta 0:04:25 lr 0.000838 wd 0.0500 time 0.5738 (0.5825) data time 0.0006 (0.0050) model time 0.5731 (0.5764) loss 7.5623 (7.5772) grad_norm 3.1835 (2.3406) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:40:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][180/625] eta 0:04:20 lr 0.000838 wd 0.0500 time 0.7404 (0.5843) data time 0.0008 (0.0047) model time 0.7396 (0.5793) loss 5.9215 (7.5708) grad_norm 2.2813 (2.3262) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:40:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][190/625] eta 0:04:15 lr 0.000838 wd 0.0500 time 0.5748 (0.5869) data time 0.0008 (0.0045) model time 0.5741 (0.5832) loss 8.8395 (7.5785) grad_norm 3.2562 (2.3314) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:40:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][200/625] eta 0:04:11 lr 0.000837 wd 0.0500 time 0.7040 (0.5911) data time 0.0007 (0.0043) model time 0.7033 (0.5890) loss 7.2683 (7.5744) grad_norm 2.1214 (2.3593) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:40:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][210/625] eta 0:04:05 lr 0.000837 wd 0.0500 time 0.5853 (0.5915) data time 0.0008 (0.0042) model time 0.5845 (0.5896) loss 8.0066 (7.5535) grad_norm 1.8388 (2.3665) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:40:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][220/625] eta 0:03:59 lr 0.000837 wd 0.0500 time 0.6165 (0.5910) data time 0.0008 (0.0040) model time 0.6157 (0.5890) loss 7.7239 (7.5298) grad_norm 1.6328 (2.3562) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:40:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][230/625] eta 0:03:53 lr 0.000837 wd 0.0500 time 0.5748 (0.5905) data time 0.0006 (0.0039) model time 0.5742 (0.5885) loss 9.0765 (7.5359) grad_norm 2.2394 (2.3550) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:40:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][240/625] eta 0:03:47 lr 0.000837 wd 0.0500 time 0.5745 (0.5902) data time 0.0008 (0.0037) model time 0.5737 (0.5881) loss 8.0747 (7.5454) grad_norm 4.4371 (2.3703) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:40:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][250/625] eta 0:03:41 lr 0.000837 wd 0.0500 time 0.5737 (0.5896) data time 0.0006 (0.0036) model time 0.5731 (0.5874) loss 9.1321 (7.5548) grad_norm 1.9395 (2.3721) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:40:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][260/625] eta 0:03:34 lr 0.000837 wd 0.0500 time 0.5755 (0.5890) data time 0.0006 (0.0035) model time 0.5748 (0.5867) loss 7.0270 (7.5375) grad_norm 1.5568 (2.3628) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:41:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][270/625] eta 0:03:28 lr 0.000837 wd 0.0500 time 0.5753 (0.5885) data time 0.0006 (0.0034) model time 0.5747 (0.5862) loss 7.5593 (7.5342) grad_norm 1.7762 (2.3428) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:41:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][280/625] eta 0:03:22 lr 0.000837 wd 0.0500 time 0.5748 (0.5881) data time 0.0007 (0.0033) model time 0.5742 (0.5857) loss 5.6160 (7.5351) grad_norm 1.9669 (2.3285) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:41:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][290/625] eta 0:03:16 lr 0.000837 wd 0.0500 time 0.5746 (0.5878) data time 0.0008 (0.0033) model time 0.5738 (0.5854) loss 8.9243 (7.5277) grad_norm 2.0525 (2.3323) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:41:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][300/625] eta 0:03:10 lr 0.000837 wd 0.0500 time 0.5788 (0.5873) data time 0.0008 (0.0032) model time 0.5780 (0.5849) loss 7.4276 (7.5176) grad_norm 1.9024 (2.3253) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:41:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][310/625] eta 0:03:04 lr 0.000836 wd 0.0500 time 0.5741 (0.5869) data time 0.0008 (0.0031) model time 0.5733 (0.5845) loss 7.8369 (7.5229) grad_norm 1.9797 (2.3292) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:41:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][320/625] eta 0:02:58 lr 0.000836 wd 0.0500 time 0.5781 (0.5865) data time 0.0006 (0.0030) model time 0.5775 (0.5841) loss 9.4566 (7.5373) grad_norm 2.6384 (2.3429) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:41:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][330/625] eta 0:02:52 lr 0.000836 wd 0.0500 time 0.5804 (0.5862) data time 0.0008 (0.0030) model time 0.5796 (0.5838) loss 8.2729 (7.5516) grad_norm 2.1691 (2.3446) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:41:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][340/625] eta 0:02:47 lr 0.000836 wd 0.0500 time 0.5734 (0.5860) data time 0.0008 (0.0030) model time 0.5726 (0.5835) loss 7.5133 (7.5551) grad_norm 1.7883 (2.3309) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:41:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][350/625] eta 0:02:41 lr 0.000836 wd 0.0500 time 0.5747 (0.5861) data time 0.0008 (0.0029) model time 0.5739 (0.5837) loss 7.2728 (7.5369) grad_norm 2.0089 (2.3283) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:41:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][360/625] eta 0:02:35 lr 0.000836 wd 0.0500 time 0.5751 (0.5858) data time 0.0008 (0.0028) model time 0.5743 (0.5834) loss 6.3011 (7.5254) grad_norm 3.4381 (2.3352) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:42:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][370/625] eta 0:02:29 lr 0.000836 wd 0.0500 time 0.5758 (0.5855) data time 0.0006 (0.0028) model time 0.5752 (0.5831) loss 8.0838 (7.5297) grad_norm 1.7799 (2.3346) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:42:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][380/625] eta 0:02:23 lr 0.000836 wd 0.0500 time 0.5773 (0.5852) data time 0.0007 (0.0027) model time 0.5766 (0.5828) loss 6.8402 (7.5352) grad_norm 2.7274 (2.3330) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:42:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][390/625] eta 0:02:17 lr 0.000836 wd 0.0500 time 0.5766 (0.5850) data time 0.0008 (0.0027) model time 0.5758 (0.5826) loss 7.6741 (7.5403) grad_norm 2.2423 (2.3264) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:42:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][400/625] eta 0:02:11 lr 0.000836 wd 0.0500 time 0.5762 (0.5855) data time 0.0006 (0.0026) model time 0.5756 (0.5833) loss 5.8906 (7.5442) grad_norm 2.8923 (2.3196) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:42:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][410/625] eta 0:02:06 lr 0.000835 wd 0.0500 time 0.5820 (0.5862) data time 0.0006 (0.0026) model time 0.5813 (0.5841) loss 6.6171 (7.5475) grad_norm 1.6056 (2.3270) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:42:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][420/625] eta 0:02:00 lr 0.000835 wd 0.0500 time 0.5744 (0.5880) data time 0.0008 (0.0025) model time 0.5737 (0.5862) loss 7.3874 (7.5472) grad_norm 2.7334 (2.3245) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:42:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][430/625] eta 0:01:54 lr 0.000835 wd 0.0500 time 0.5755 (0.5883) data time 0.0008 (0.0025) model time 0.5747 (0.5865) loss 7.7170 (7.5437) grad_norm 2.6086 (2.3249) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:42:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][440/625] eta 0:01:48 lr 0.000835 wd 0.0500 time 0.5728 (0.5880) data time 0.0008 (0.0025) model time 0.5719 (0.5862) loss 8.3954 (7.5461) grad_norm 1.7111 (2.3222) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:42:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][450/625] eta 0:01:42 lr 0.000835 wd 0.0500 time 0.5732 (0.5877) data time 0.0006 (0.0024) model time 0.5726 (0.5859) loss 6.6376 (7.5475) grad_norm 2.7468 (2.3139) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:42:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][460/625] eta 0:01:36 lr 0.000835 wd 0.0500 time 0.5751 (0.5874) data time 0.0008 (0.0024) model time 0.5743 (0.5856) loss 7.6573 (7.5452) grad_norm 1.9094 (2.3108) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:43:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][470/625] eta 0:01:31 lr 0.000835 wd 0.0500 time 0.5785 (0.5871) data time 0.0008 (0.0024) model time 0.5776 (0.5853) loss 6.5614 (7.5455) grad_norm 1.8677 (2.3092) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:43:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][480/625] eta 0:01:25 lr 0.000835 wd 0.0500 time 0.5763 (0.5869) data time 0.0008 (0.0023) model time 0.5755 (0.5850) loss 9.4842 (7.5449) grad_norm 1.9443 (2.3107) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:43:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][490/625] eta 0:01:19 lr 0.000835 wd 0.0500 time 0.5776 (0.5866) data time 0.0008 (0.0023) model time 0.5768 (0.5848) loss 5.8751 (7.5493) grad_norm 1.7118 (2.3089) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:43:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][500/625] eta 0:01:13 lr 0.000835 wd 0.0500 time 0.5788 (0.5865) data time 0.0008 (0.0023) model time 0.5781 (0.5846) loss 7.7613 (7.5443) grad_norm 2.0582 (2.3034) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:43:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][510/625] eta 0:01:07 lr 0.000834 wd 0.0500 time 0.6097 (0.5863) data time 0.0010 (0.0023) model time 0.6088 (0.5845) loss 8.4666 (7.5533) grad_norm 1.6577 (2.2964) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:43:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][520/625] eta 0:01:01 lr 0.000834 wd 0.0500 time 0.5764 (0.5861) data time 0.0007 (0.0022) model time 0.5757 (0.5843) loss 6.7784 (7.5546) grad_norm 2.3441 (2.2982) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:43:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][530/625] eta 0:00:55 lr 0.000834 wd 0.0500 time 0.5924 (0.5860) data time 0.0008 (0.0022) model time 0.5915 (0.5841) loss 8.2845 (7.5501) grad_norm 2.0445 (2.2969) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:43:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][540/625] eta 0:00:49 lr 0.000834 wd 0.0500 time 0.5820 (0.5858) data time 0.0006 (0.0022) model time 0.5813 (0.5840) loss 7.0948 (7.5509) grad_norm 1.2821 (2.2935) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:43:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][550/625] eta 0:00:43 lr 0.000834 wd 0.0500 time 0.5754 (0.5857) data time 0.0006 (0.0021) model time 0.5748 (0.5839) loss 9.0530 (7.5551) grad_norm 2.1475 (2.2893) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:43:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][560/625] eta 0:00:38 lr 0.000834 wd 0.0500 time 0.5745 (0.5855) data time 0.0006 (0.0021) model time 0.5738 (0.5837) loss 7.4879 (7.5607) grad_norm 2.3377 (2.2909) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:43:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][570/625] eta 0:00:32 lr 0.000834 wd 0.0500 time 0.5754 (0.5856) data time 0.0008 (0.0021) model time 0.5746 (0.5839) loss 9.0702 (7.5713) grad_norm 2.1093 (2.2949) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:44:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][580/625] eta 0:00:26 lr 0.000834 wd 0.0500 time 0.5771 (0.5855) data time 0.0007 (0.0021) model time 0.5764 (0.5837) loss 6.5631 (7.5767) grad_norm 3.9836 (2.2951) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:44:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][590/625] eta 0:00:20 lr 0.000834 wd 0.0500 time 0.5842 (0.5854) data time 0.0006 (0.0021) model time 0.5836 (0.5836) loss 7.7296 (7.5749) grad_norm 2.6971 (2.2880) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:44:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][600/625] eta 0:00:14 lr 0.000834 wd 0.0500 time 0.5750 (0.5852) data time 0.0008 (0.0020) model time 0.5743 (0.5834) loss 9.2400 (7.5739) grad_norm 2.2373 (2.2909) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:44:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][610/625] eta 0:00:08 lr 0.000833 wd 0.0500 time 0.5743 (0.5850) data time 0.0004 (0.0020) model time 0.5739 (0.5832) loss 6.9487 (7.5719) grad_norm 2.6898 (2.2917) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:44:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [124/300][620/625] eta 0:00:02 lr 0.000833 wd 0.0500 time 0.7161 (0.5855) data time 0.0004 (0.0020) model time 0.7157 (0.5838) loss 8.8859 (7.5744) grad_norm 1.9098 (2.2901) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:44:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 124 training takes 0:06:05 +[2024-07-25 02:44:30 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 02:44:31 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 02:44:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.484 (0.484) Loss 0.5293 (0.5293) Acc@1 89.160 (89.160) Acc@5 98.535 (98.535) Mem 22339MB +[2024-07-25 02:44:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.159) Loss 0.8857 (0.6754) Acc@1 79.883 (85.463) Acc@5 95.215 (97.390) Mem 22339MB +[2024-07-25 02:44:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.9956 (0.7948) Acc@1 75.635 (82.036) Acc@5 94.287 (96.105) Mem 22339MB +[2024-07-25 02:44:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.694 Acc@5 96.053 +[2024-07-25 02:44:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.7% +[2024-07-25 02:44:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 1.019 (1.019) Loss 0.5024 (0.5024) Acc@1 89.258 (89.258) Acc@5 98.682 (98.682) Mem 22339MB +[2024-07-25 02:44:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.207) Loss 0.8027 (0.6336) Acc@1 81.006 (86.257) Acc@5 96.289 (97.723) Mem 22339MB +[2024-07-25 02:44:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.168) Loss 0.9185 (0.7436) Acc@1 76.953 (82.931) Acc@5 95.557 (96.587) Mem 22339MB +[2024-07-25 02:44:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.604 Acc@5 96.599 +[2024-07-25 02:44:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.6% +[2024-07-25 02:44:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.60% +[2024-07-25 02:44:39 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 02:44:40 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 02:44:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][0/625] eta 0:08:57 lr 0.000833 wd 0.0500 time 0.8596 (0.8596) data time 0.3406 (0.3406) model time 0.0000 (0.0000) loss 8.5932 (8.5932) grad_norm 1.8555 (1.8555) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:44:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][10/625] eta 0:06:50 lr 0.000833 wd 0.0500 time 0.7071 (0.6677) data time 0.0006 (0.0317) model time 0.0000 (0.0000) loss 7.8580 (7.5006) grad_norm 2.7972 (2.0643) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:44:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][20/625] eta 0:06:33 lr 0.000833 wd 0.0500 time 0.6968 (0.6503) data time 0.0008 (0.0170) model time 0.0000 (0.0000) loss 7.8224 (7.6119) grad_norm 1.6406 (2.3139) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:45:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][30/625] eta 0:06:16 lr 0.000833 wd 0.0500 time 0.5627 (0.6321) data time 0.0009 (0.0118) model time 0.0000 (0.0000) loss 6.6852 (7.6906) grad_norm 2.1067 (2.4003) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:45:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][40/625] eta 0:06:01 lr 0.000833 wd 0.0500 time 0.5691 (0.6187) data time 0.0006 (0.0091) model time 0.0000 (0.0000) loss 6.4833 (7.6355) grad_norm 2.9023 (2.3555) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:45:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][50/625] eta 0:05:50 lr 0.000833 wd 0.0500 time 0.5695 (0.6102) data time 0.0006 (0.0075) model time 0.0000 (0.0000) loss 7.4120 (7.6420) grad_norm 2.7610 (2.3551) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:45:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][60/625] eta 0:05:41 lr 0.000833 wd 0.0500 time 0.5627 (0.6043) data time 0.0008 (0.0064) model time 0.5618 (0.5737) loss 7.7604 (7.6571) grad_norm 1.7511 (2.3265) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:45:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][70/625] eta 0:05:33 lr 0.000833 wd 0.0500 time 0.5652 (0.6001) data time 0.0008 (0.0056) model time 0.5644 (0.5735) loss 8.4371 (7.6663) grad_norm 2.0814 (2.2841) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:45:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][80/625] eta 0:05:25 lr 0.000833 wd 0.0500 time 0.5728 (0.5971) data time 0.0006 (0.0050) model time 0.5722 (0.5740) loss 6.5496 (7.6114) grad_norm 1.8135 (2.2314) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:45:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][90/625] eta 0:05:18 lr 0.000832 wd 0.0500 time 0.5742 (0.5949) data time 0.0006 (0.0045) model time 0.5736 (0.5746) loss 7.8942 (7.5539) grad_norm 2.1455 (2.2313) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:45:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][100/625] eta 0:05:12 lr 0.000832 wd 0.0500 time 0.5192 (0.5943) data time 0.0009 (0.0042) model time 0.5183 (0.5774) loss 8.5501 (7.5568) grad_norm 1.9421 (2.2203) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:45:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][110/625] eta 0:05:05 lr 0.000832 wd 0.0500 time 0.5621 (0.5935) data time 0.0008 (0.0039) model time 0.5614 (0.5784) loss 7.4909 (7.5941) grad_norm 1.9861 (2.2416) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:45:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][120/625] eta 0:04:59 lr 0.000832 wd 0.0500 time 0.5712 (0.5923) data time 0.0008 (0.0036) model time 0.5704 (0.5785) loss 8.2083 (7.5904) grad_norm 2.3284 (2.2446) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:45:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][130/625] eta 0:04:52 lr 0.000832 wd 0.0500 time 0.5659 (0.5910) data time 0.0006 (0.0034) model time 0.5653 (0.5780) loss 8.7495 (7.6183) grad_norm 2.1706 (2.2363) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:46:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][140/625] eta 0:04:46 lr 0.000832 wd 0.0500 time 0.5651 (0.5902) data time 0.0006 (0.0032) model time 0.5645 (0.5781) loss 7.1585 (7.6109) grad_norm 2.6428 (2.2431) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 02:46:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][150/625] eta 0:04:39 lr 0.000832 wd 0.0500 time 0.5686 (0.5892) data time 0.0006 (0.0031) model time 0.5680 (0.5777) loss 6.5278 (7.5917) grad_norm 1.9519 (2.2300) loss_scale 2048.0000 (1030.7815) mem 22339MB +[2024-07-25 02:46:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][160/625] eta 0:04:33 lr 0.000832 wd 0.0500 time 0.5724 (0.5884) data time 0.0006 (0.0029) model time 0.5718 (0.5774) loss 6.1906 (7.5805) grad_norm 1.9423 (2.2446) loss_scale 2048.0000 (1093.9627) mem 22339MB +[2024-07-25 02:46:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][170/625] eta 0:04:27 lr 0.000832 wd 0.0500 time 0.5743 (0.5876) data time 0.0008 (0.0028) model time 0.5736 (0.5771) loss 9.0455 (7.5955) grad_norm 3.3695 (2.2530) loss_scale 2048.0000 (1149.7544) mem 22339MB +[2024-07-25 02:46:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][180/625] eta 0:04:21 lr 0.000832 wd 0.0500 time 0.5669 (0.5869) data time 0.0007 (0.0027) model time 0.5662 (0.5770) loss 8.1472 (7.6013) grad_norm 1.8319 (2.2469) loss_scale 2048.0000 (1199.3812) mem 22339MB +[2024-07-25 02:46:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][190/625] eta 0:04:15 lr 0.000831 wd 0.0500 time 0.5649 (0.5863) data time 0.0007 (0.0026) model time 0.5642 (0.5768) loss 7.4514 (7.6111) grad_norm 2.2311 (2.2397) loss_scale 2048.0000 (1243.8115) mem 22339MB +[2024-07-25 02:46:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][200/625] eta 0:04:08 lr 0.000831 wd 0.0500 time 0.5691 (0.5858) data time 0.0006 (0.0025) model time 0.5685 (0.5767) loss 8.1829 (7.6123) grad_norm 1.8038 (2.2446) loss_scale 2048.0000 (1283.8209) mem 22339MB +[2024-07-25 02:46:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][210/625] eta 0:04:03 lr 0.000831 wd 0.0500 time 0.7078 (0.5867) data time 0.0008 (0.0024) model time 0.7070 (0.5784) loss 7.2666 (7.6187) grad_norm 2.9781 (2.2778) loss_scale 2048.0000 (1320.0379) mem 22339MB +[2024-07-25 02:46:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][220/625] eta 0:03:58 lr 0.000831 wd 0.0500 time 0.5700 (0.5880) data time 0.0006 (0.0024) model time 0.5694 (0.5806) loss 7.9476 (7.5960) grad_norm 1.8363 (2.2829) loss_scale 2048.0000 (1352.9774) mem 22339MB +[2024-07-25 02:46:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][230/625] eta 0:03:53 lr 0.000831 wd 0.0500 time 0.7353 (0.5907) data time 0.0006 (0.0023) model time 0.7347 (0.5844) loss 7.2671 (7.6198) grad_norm 1.9864 (2.2842) loss_scale 2048.0000 (1383.0649) mem 22339MB +[2024-07-25 02:47:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][240/625] eta 0:03:48 lr 0.000831 wd 0.0500 time 0.5701 (0.5927) data time 0.0008 (0.0022) model time 0.5693 (0.5872) loss 6.4622 (7.5992) grad_norm 2.4920 (2.2794) loss_scale 2048.0000 (1410.6556) mem 22339MB +[2024-07-25 02:47:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][250/625] eta 0:03:42 lr 0.000831 wd 0.0500 time 0.5678 (0.5926) data time 0.0006 (0.0022) model time 0.5672 (0.5872) loss 7.4836 (7.5960) grad_norm 2.0014 (2.2667) loss_scale 2048.0000 (1436.0478) mem 22339MB +[2024-07-25 02:47:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][260/625] eta 0:03:36 lr 0.000831 wd 0.0500 time 0.5721 (0.5919) data time 0.0008 (0.0021) model time 0.5713 (0.5866) loss 5.3433 (7.5978) grad_norm 2.2836 (2.2656) loss_scale 2048.0000 (1459.4943) mem 22339MB +[2024-07-25 02:47:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][270/625] eta 0:03:29 lr 0.000831 wd 0.0500 time 0.5790 (0.5914) data time 0.0008 (0.0021) model time 0.5782 (0.5862) loss 7.2792 (7.6016) grad_norm 2.2995 (2.2751) loss_scale 2048.0000 (1481.2103) mem 22339MB +[2024-07-25 02:47:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][280/625] eta 0:03:23 lr 0.000831 wd 0.0500 time 0.5656 (0.5908) data time 0.0008 (0.0020) model time 0.5647 (0.5857) loss 7.7180 (7.6003) grad_norm 3.8533 (2.2774) loss_scale 2048.0000 (1501.3808) mem 22339MB +[2024-07-25 02:47:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][290/625] eta 0:03:17 lr 0.000830 wd 0.0500 time 0.5740 (0.5903) data time 0.0006 (0.0020) model time 0.5734 (0.5852) loss 8.9273 (7.6005) grad_norm 1.8107 (2.2862) loss_scale 2048.0000 (1520.1649) mem 22339MB +[2024-07-25 02:47:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][300/625] eta 0:03:11 lr 0.000830 wd 0.0500 time 0.5821 (0.5898) data time 0.0008 (0.0020) model time 0.5813 (0.5848) loss 6.6399 (7.6019) grad_norm 4.5637 (2.3126) loss_scale 2048.0000 (1537.7010) mem 22339MB +[2024-07-25 02:47:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][310/625] eta 0:03:05 lr 0.000830 wd 0.0500 time 0.5641 (0.5893) data time 0.0008 (0.0019) model time 0.5633 (0.5844) loss 6.9455 (7.6097) grad_norm 1.7159 (2.3158) loss_scale 2048.0000 (1554.1093) mem 22339MB +[2024-07-25 02:47:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][320/625] eta 0:02:59 lr 0.000830 wd 0.0500 time 0.5675 (0.5889) data time 0.0006 (0.0019) model time 0.5669 (0.5840) loss 6.2845 (7.5993) grad_norm 2.6414 (2.3133) loss_scale 2048.0000 (1569.4953) mem 22339MB +[2024-07-25 02:47:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][330/625] eta 0:02:53 lr 0.000830 wd 0.0500 time 0.5683 (0.5890) data time 0.0006 (0.0019) model time 0.5677 (0.5843) loss 8.1334 (7.6005) grad_norm 1.7233 (2.3008) loss_scale 2048.0000 (1583.9517) mem 22339MB +[2024-07-25 02:48:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][340/625] eta 0:02:47 lr 0.000830 wd 0.0500 time 0.5714 (0.5886) data time 0.0006 (0.0018) model time 0.5708 (0.5840) loss 7.9658 (7.5833) grad_norm 3.2189 (2.2987) loss_scale 2048.0000 (1597.5601) mem 22339MB +[2024-07-25 02:48:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][350/625] eta 0:02:41 lr 0.000830 wd 0.0500 time 0.5761 (0.5883) data time 0.0006 (0.0018) model time 0.5755 (0.5838) loss 8.9237 (7.5866) grad_norm 1.4900 (2.3016) loss_scale 2048.0000 (1610.3932) mem 22339MB +[2024-07-25 02:48:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][360/625] eta 0:02:35 lr 0.000830 wd 0.0500 time 0.5747 (0.5880) data time 0.0008 (0.0018) model time 0.5739 (0.5835) loss 8.0320 (7.5842) grad_norm 3.3626 (2.3037) loss_scale 2048.0000 (1622.5152) mem 22339MB +[2024-07-25 02:48:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][370/625] eta 0:02:29 lr 0.000830 wd 0.0500 time 0.5654 (0.5877) data time 0.0006 (0.0017) model time 0.5648 (0.5832) loss 7.6521 (7.5864) grad_norm 1.9044 (2.3181) loss_scale 2048.0000 (1633.9838) mem 22339MB +[2024-07-25 02:48:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][380/625] eta 0:02:23 lr 0.000830 wd 0.0500 time 0.5747 (0.5873) data time 0.0008 (0.0017) model time 0.5739 (0.5830) loss 9.1425 (7.6068) grad_norm 2.3214 (2.3199) loss_scale 2048.0000 (1644.8504) mem 22339MB +[2024-07-25 02:48:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][390/625] eta 0:02:17 lr 0.000829 wd 0.0500 time 0.5749 (0.5871) data time 0.0006 (0.0017) model time 0.5743 (0.5828) loss 7.5532 (7.6195) grad_norm 1.9172 (2.3155) loss_scale 2048.0000 (1655.1611) mem 22339MB +[2024-07-25 02:48:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][400/625] eta 0:02:12 lr 0.000829 wd 0.0500 time 0.5600 (0.5868) data time 0.0006 (0.0017) model time 0.5594 (0.5825) loss 7.8738 (7.6353) grad_norm 2.5063 (2.3158) loss_scale 2048.0000 (1664.9576) mem 22339MB +[2024-07-25 02:48:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][410/625] eta 0:02:06 lr 0.000829 wd 0.0500 time 0.5635 (0.5865) data time 0.0007 (0.0017) model time 0.5628 (0.5823) loss 7.0911 (7.6423) grad_norm 2.8786 (2.3080) loss_scale 2048.0000 (1674.2774) mem 22339MB +[2024-07-25 02:48:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][420/625] eta 0:02:00 lr 0.000829 wd 0.0500 time 0.5730 (0.5863) data time 0.0008 (0.0016) model time 0.5722 (0.5822) loss 7.2517 (7.6276) grad_norm 1.9444 (2.3082) loss_scale 2048.0000 (1683.1544) mem 22339MB +[2024-07-25 02:48:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][430/625] eta 0:01:54 lr 0.000829 wd 0.0500 time 0.5711 (0.5867) data time 0.0008 (0.0016) model time 0.5703 (0.5827) loss 8.6529 (7.6221) grad_norm 1.6590 (2.3145) loss_scale 2048.0000 (1691.6195) mem 22339MB +[2024-07-25 02:48:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][440/625] eta 0:01:48 lr 0.000829 wd 0.0500 time 0.6846 (0.5874) data time 0.0006 (0.0016) model time 0.6840 (0.5836) loss 6.7807 (7.6279) grad_norm 2.3325 (2.3109) loss_scale 2048.0000 (1699.7007) mem 22339MB +[2024-07-25 02:49:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][450/625] eta 0:01:42 lr 0.000829 wd 0.0500 time 0.7469 (0.5884) data time 0.0009 (0.0016) model time 0.7460 (0.5848) loss 7.5155 (7.6226) grad_norm 2.6549 (2.3181) loss_scale 2048.0000 (1707.4235) mem 22339MB +[2024-07-25 02:49:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][460/625] eta 0:01:37 lr 0.000829 wd 0.0500 time 0.5693 (0.5892) data time 0.0007 (0.0016) model time 0.5686 (0.5857) loss 8.7926 (7.6357) grad_norm 2.6195 (2.3241) loss_scale 2048.0000 (1714.8113) mem 22339MB +[2024-07-25 02:49:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][470/625] eta 0:01:31 lr 0.000829 wd 0.0500 time 0.5725 (0.5895) data time 0.0007 (0.0016) model time 0.5718 (0.5861) loss 7.6358 (7.6368) grad_norm 1.8985 (2.3273) loss_scale 2048.0000 (1721.8854) mem 22339MB +[2024-07-25 02:49:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][480/625] eta 0:01:25 lr 0.000829 wd 0.0500 time 0.5700 (0.5892) data time 0.0008 (0.0015) model time 0.5692 (0.5859) loss 7.4087 (7.6357) grad_norm 1.9585 (2.3292) loss_scale 2048.0000 (1728.6653) mem 22339MB +[2024-07-25 02:49:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][490/625] eta 0:01:19 lr 0.000828 wd 0.0500 time 0.5745 (0.5890) data time 0.0006 (0.0016) model time 0.5739 (0.5857) loss 7.2760 (7.6341) grad_norm 2.7265 (2.3229) loss_scale 2048.0000 (1735.1690) mem 22339MB +[2024-07-25 02:49:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][500/625] eta 0:01:13 lr 0.000828 wd 0.0500 time 0.5606 (0.5887) data time 0.0007 (0.0015) model time 0.5599 (0.5854) loss 8.5400 (7.6327) grad_norm 1.4705 (2.3208) loss_scale 2048.0000 (1741.4132) mem 22339MB +[2024-07-25 02:49:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][510/625] eta 0:01:07 lr 0.000828 wd 0.0500 time 0.5767 (0.5885) data time 0.0008 (0.0015) model time 0.5759 (0.5853) loss 8.2098 (7.6367) grad_norm 2.4971 (2.3349) loss_scale 2048.0000 (1747.4129) mem 22339MB +[2024-07-25 02:49:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][520/625] eta 0:01:01 lr 0.000828 wd 0.0500 time 0.5608 (0.5883) data time 0.0006 (0.0015) model time 0.5602 (0.5850) loss 7.9935 (7.6437) grad_norm 1.9217 (2.3302) loss_scale 2048.0000 (1753.1823) mem 22339MB +[2024-07-25 02:49:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][530/625] eta 0:00:55 lr 0.000828 wd 0.0500 time 0.5678 (0.5881) data time 0.0008 (0.0015) model time 0.5670 (0.5849) loss 8.5895 (7.6527) grad_norm 1.7613 (2.3360) loss_scale 2048.0000 (1758.7345) mem 22339MB +[2024-07-25 02:49:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][540/625] eta 0:00:50 lr 0.000828 wd 0.0500 time 0.7940 (0.5884) data time 0.0006 (0.0015) model time 0.7934 (0.5852) loss 10.2692 (7.6542) grad_norm 2.9008 (2.3389) loss_scale 2048.0000 (1764.0813) mem 22339MB +[2024-07-25 02:50:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][550/625] eta 0:00:44 lr 0.000828 wd 0.0500 time 0.5699 (0.5881) data time 0.0006 (0.0015) model time 0.5693 (0.5849) loss 6.1805 (7.6472) grad_norm 2.8805 (2.3418) loss_scale 2048.0000 (1769.2341) mem 22339MB +[2024-07-25 02:50:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][560/625] eta 0:00:38 lr 0.000828 wd 0.0500 time 0.5742 (0.5878) data time 0.0008 (0.0015) model time 0.5734 (0.5847) loss 6.8717 (7.6457) grad_norm 2.3441 (2.3369) loss_scale 2048.0000 (1774.2032) mem 22339MB +[2024-07-25 02:50:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][570/625] eta 0:00:32 lr 0.000828 wd 0.0500 time 0.5736 (0.5876) data time 0.0008 (0.0015) model time 0.5727 (0.5845) loss 7.5700 (7.6524) grad_norm 1.7388 (2.3397) loss_scale 2048.0000 (1778.9982) mem 22339MB +[2024-07-25 02:50:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][580/625] eta 0:00:26 lr 0.000828 wd 0.0500 time 0.5681 (0.5874) data time 0.0009 (0.0014) model time 0.5673 (0.5844) loss 8.7261 (7.6625) grad_norm 1.8877 (2.3403) loss_scale 2048.0000 (1783.6282) mem 22339MB +[2024-07-25 02:50:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][590/625] eta 0:00:20 lr 0.000827 wd 0.0500 time 0.5691 (0.5873) data time 0.0006 (0.0014) model time 0.5685 (0.5842) loss 6.1683 (7.6619) grad_norm 3.9313 (2.3533) loss_scale 2048.0000 (1788.1015) mem 22339MB +[2024-07-25 02:50:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][600/625] eta 0:00:14 lr 0.000827 wd 0.0500 time 0.5682 (0.5871) data time 0.0007 (0.0014) model time 0.5675 (0.5841) loss 6.3452 (7.6610) grad_norm 2.5090 (2.3540) loss_scale 2048.0000 (1792.4260) mem 22339MB +[2024-07-25 02:50:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][610/625] eta 0:00:08 lr 0.000827 wd 0.0500 time 0.5707 (0.5869) data time 0.0004 (0.0014) model time 0.5702 (0.5839) loss 6.3906 (7.6634) grad_norm 1.8134 (2.3518) loss_scale 2048.0000 (1796.6088) mem 22339MB +[2024-07-25 02:50:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [125/300][620/625] eta 0:00:02 lr 0.000827 wd 0.0500 time 0.5662 (0.5867) data time 0.0004 (0.0014) model time 0.5657 (0.5837) loss 8.2978 (7.6674) grad_norm 1.9574 (2.3450) loss_scale 2048.0000 (1800.6570) mem 22339MB +[2024-07-25 02:50:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 125 training takes 0:06:06 +[2024-07-25 02:50:47 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 02:50:48 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 02:50:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.479 (0.479) Loss 0.5322 (0.5322) Acc@1 88.916 (88.916) Acc@5 98.340 (98.340) Mem 22339MB +[2024-07-25 02:50:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.159) Loss 0.8462 (0.6663) Acc@1 79.883 (85.662) Acc@5 95.996 (97.563) Mem 22339MB +[2024-07-25 02:50:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.9883 (0.7937) Acc@1 75.635 (82.182) Acc@5 94.092 (96.168) Mem 22339MB +[2024-07-25 02:50:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.894 Acc@5 96.169 +[2024-07-25 02:50:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.9% +[2024-07-25 02:50:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.853 (0.853) Loss 0.5024 (0.5024) Acc@1 89.209 (89.209) Acc@5 98.633 (98.633) Mem 22339MB +[2024-07-25 02:50:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.192) Loss 0.8013 (0.6330) Acc@1 81.006 (86.315) Acc@5 96.289 (97.710) Mem 22339MB +[2024-07-25 02:50:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.160) Loss 0.9180 (0.7429) Acc@1 77.100 (83.003) Acc@5 95.361 (96.577) Mem 22339MB +[2024-07-25 02:50:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.666 Acc@5 96.587 +[2024-07-25 02:50:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.7% +[2024-07-25 02:50:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.67% +[2024-07-25 02:50:56 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 02:50:57 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 02:50:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][0/625] eta 0:09:45 lr 0.000827 wd 0.0500 time 0.9362 (0.9362) data time 0.4169 (0.4169) model time 0.0000 (0.0000) loss 9.0612 (9.0612) grad_norm 2.4768 (2.4768) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:51:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][10/625] eta 0:06:13 lr 0.000827 wd 0.0500 time 0.5748 (0.6070) data time 0.0006 (0.0387) model time 0.0000 (0.0000) loss 8.4380 (8.1191) grad_norm 4.9596 (2.4721) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:51:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][20/625] eta 0:05:58 lr 0.000827 wd 0.0500 time 0.5652 (0.5932) data time 0.0008 (0.0206) model time 0.0000 (0.0000) loss 7.8977 (8.0084) grad_norm 2.3961 (2.4326) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:51:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][30/625] eta 0:06:01 lr 0.000827 wd 0.0500 time 0.7296 (0.6078) data time 0.0008 (0.0142) model time 0.0000 (0.0000) loss 6.9946 (7.9034) grad_norm 2.3880 (2.4091) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:51:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][40/625] eta 0:05:57 lr 0.000827 wd 0.0500 time 0.7509 (0.6114) data time 0.0008 (0.0110) model time 0.0000 (0.0000) loss 7.5979 (7.7637) grad_norm 2.0845 (2.3275) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:51:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][50/625] eta 0:05:53 lr 0.000827 wd 0.0500 time 0.7309 (0.6153) data time 0.0008 (0.0090) model time 0.0000 (0.0000) loss 7.4022 (7.6826) grad_norm 2.7969 (2.3986) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:51:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][60/625] eta 0:05:48 lr 0.000827 wd 0.0500 time 0.5685 (0.6169) data time 0.0006 (0.0076) model time 0.5679 (0.6242) loss 6.0315 (7.7375) grad_norm 1.7287 (2.3688) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:51:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][70/625] eta 0:05:40 lr 0.000826 wd 0.0500 time 0.5199 (0.6130) data time 0.0009 (0.0067) model time 0.5190 (0.6062) loss 8.4969 (7.6962) grad_norm 3.1349 (2.3376) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:51:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][80/625] eta 0:05:31 lr 0.000826 wd 0.0500 time 0.5728 (0.6087) data time 0.0006 (0.0060) model time 0.5722 (0.5967) loss 8.2410 (7.7135) grad_norm 1.9018 (2.3030) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:51:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][90/625] eta 0:05:23 lr 0.000826 wd 0.0500 time 0.5665 (0.6052) data time 0.0007 (0.0054) model time 0.5658 (0.5915) loss 8.5602 (7.6665) grad_norm 1.7004 (2.2944) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:51:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][100/625] eta 0:05:16 lr 0.000826 wd 0.0500 time 0.5742 (0.6025) data time 0.0007 (0.0049) model time 0.5735 (0.5887) loss 7.6384 (7.6366) grad_norm 1.8045 (2.3047) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:52:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][110/625] eta 0:05:09 lr 0.000826 wd 0.0500 time 0.5624 (0.6001) data time 0.0008 (0.0045) model time 0.5616 (0.5864) loss 8.5534 (7.6425) grad_norm 2.2680 (2.3094) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:52:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][120/625] eta 0:05:02 lr 0.000826 wd 0.0500 time 0.5738 (0.5982) data time 0.0008 (0.0042) model time 0.5731 (0.5849) loss 7.8785 (7.6343) grad_norm 1.6581 (2.2741) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:52:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][130/625] eta 0:04:55 lr 0.000826 wd 0.0500 time 0.5636 (0.5966) data time 0.0006 (0.0040) model time 0.5630 (0.5838) loss 7.2593 (7.6502) grad_norm 1.8844 (2.2475) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:52:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][140/625] eta 0:04:48 lr 0.000826 wd 0.0500 time 0.5718 (0.5950) data time 0.0007 (0.0038) model time 0.5710 (0.5827) loss 7.9537 (7.6843) grad_norm 2.1861 (2.2444) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:52:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][150/625] eta 0:04:41 lr 0.000826 wd 0.0500 time 0.5715 (0.5937) data time 0.0008 (0.0036) model time 0.5708 (0.5818) loss 8.5142 (7.6649) grad_norm 1.8211 (2.2328) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:52:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][160/625] eta 0:04:35 lr 0.000826 wd 0.0500 time 0.5729 (0.5925) data time 0.0006 (0.0034) model time 0.5722 (0.5811) loss 7.7283 (7.6770) grad_norm 1.4949 (2.2333) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:52:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][170/625] eta 0:04:29 lr 0.000825 wd 0.0500 time 0.5695 (0.5915) data time 0.0006 (0.0033) model time 0.5689 (0.5805) loss 7.1100 (7.6450) grad_norm 2.9122 (2.2459) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:52:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][180/625] eta 0:04:22 lr 0.000825 wd 0.0500 time 0.5688 (0.5905) data time 0.0008 (0.0031) model time 0.5680 (0.5799) loss 7.1181 (7.6397) grad_norm 1.8566 (2.2454) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:52:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][190/625] eta 0:04:16 lr 0.000825 wd 0.0500 time 0.5637 (0.5897) data time 0.0006 (0.0030) model time 0.5631 (0.5795) loss 6.2765 (7.5973) grad_norm 2.0680 (2.2319) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:52:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][200/625] eta 0:04:10 lr 0.000825 wd 0.0500 time 0.5716 (0.5889) data time 0.0006 (0.0029) model time 0.5709 (0.5791) loss 8.5412 (7.5903) grad_norm 1.6062 (2.2168) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:53:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][210/625] eta 0:04:04 lr 0.000825 wd 0.0500 time 0.5691 (0.5883) data time 0.0008 (0.0028) model time 0.5683 (0.5789) loss 7.5483 (7.5866) grad_norm 2.1880 (2.2121) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:53:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][220/625] eta 0:03:58 lr 0.000825 wd 0.0500 time 0.5721 (0.5878) data time 0.0006 (0.0027) model time 0.5715 (0.5787) loss 7.5507 (7.5843) grad_norm 1.8882 (2.2100) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:53:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][230/625] eta 0:03:51 lr 0.000825 wd 0.0500 time 0.5745 (0.5872) data time 0.0008 (0.0026) model time 0.5737 (0.5784) loss 8.9173 (7.5822) grad_norm 1.6129 (2.2061) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:53:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][240/625] eta 0:03:45 lr 0.000825 wd 0.0500 time 0.5709 (0.5867) data time 0.0006 (0.0026) model time 0.5703 (0.5783) loss 7.5106 (7.5992) grad_norm 1.7229 (2.1973) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:53:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][250/625] eta 0:03:40 lr 0.000825 wd 0.0500 time 0.6796 (0.5887) data time 0.0007 (0.0025) model time 0.6789 (0.5811) loss 7.3551 (7.5843) grad_norm 1.7426 (2.2038) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:53:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][260/625] eta 0:03:35 lr 0.000825 wd 0.0500 time 0.5665 (0.5898) data time 0.0007 (0.0024) model time 0.5658 (0.5827) loss 7.9385 (7.5958) grad_norm 1.9141 (2.2062) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:53:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][270/625] eta 0:03:30 lr 0.000824 wd 0.0500 time 0.7616 (0.5925) data time 0.0008 (0.0024) model time 0.7608 (0.5864) loss 6.2689 (7.5862) grad_norm 2.5776 (2.2048) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:53:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][280/625] eta 0:03:24 lr 0.000824 wd 0.0500 time 0.5699 (0.5936) data time 0.0008 (0.0023) model time 0.5691 (0.5880) loss 8.2679 (7.5944) grad_norm 2.4046 (2.2153) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:53:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][290/625] eta 0:03:18 lr 0.000824 wd 0.0500 time 0.5719 (0.5929) data time 0.0006 (0.0022) model time 0.5713 (0.5874) loss 7.9217 (7.5996) grad_norm 2.0588 (2.2149) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:53:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][300/625] eta 0:03:12 lr 0.000824 wd 0.0500 time 0.5619 (0.5928) data time 0.0008 (0.0022) model time 0.5611 (0.5874) loss 8.6493 (7.6085) grad_norm 1.9540 (2.2171) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:54:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][310/625] eta 0:03:06 lr 0.000824 wd 0.0500 time 0.5858 (0.5923) data time 0.0006 (0.0022) model time 0.5852 (0.5869) loss 6.7503 (7.6083) grad_norm 2.8210 (2.2149) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:54:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][320/625] eta 0:03:00 lr 0.000824 wd 0.0500 time 0.5711 (0.5917) data time 0.0008 (0.0021) model time 0.5704 (0.5864) loss 6.5470 (7.6063) grad_norm 1.7958 (2.2029) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:54:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][330/625] eta 0:02:54 lr 0.000824 wd 0.0500 time 0.5772 (0.5912) data time 0.0006 (0.0021) model time 0.5766 (0.5860) loss 7.7236 (7.5905) grad_norm 1.9355 (2.2013) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:54:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][340/625] eta 0:02:48 lr 0.000824 wd 0.0500 time 0.5698 (0.5907) data time 0.0006 (0.0020) model time 0.5692 (0.5856) loss 5.1465 (7.5848) grad_norm 1.7213 (2.1952) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:54:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][350/625] eta 0:02:42 lr 0.000824 wd 0.0500 time 0.5716 (0.5902) data time 0.0006 (0.0020) model time 0.5710 (0.5852) loss 5.6760 (7.5929) grad_norm 1.6623 (2.1862) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:54:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][360/625] eta 0:02:36 lr 0.000824 wd 0.0500 time 0.5616 (0.5898) data time 0.0008 (0.0020) model time 0.5608 (0.5848) loss 6.5848 (7.5927) grad_norm 2.3796 (2.1818) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:54:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][370/625] eta 0:02:30 lr 0.000823 wd 0.0500 time 0.5714 (0.5894) data time 0.0006 (0.0019) model time 0.5708 (0.5844) loss 6.2624 (7.5966) grad_norm 1.4294 (2.1755) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:54:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][380/625] eta 0:02:24 lr 0.000823 wd 0.0500 time 0.5743 (0.5890) data time 0.0007 (0.0019) model time 0.5736 (0.5841) loss 8.7674 (7.5806) grad_norm 2.0976 (2.1743) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:54:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][390/625] eta 0:02:18 lr 0.000823 wd 0.0500 time 0.5802 (0.5887) data time 0.0008 (0.0019) model time 0.5793 (0.5839) loss 6.7809 (7.5759) grad_norm 1.6299 (2.1714) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:54:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][400/625] eta 0:02:12 lr 0.000823 wd 0.0500 time 0.5647 (0.5883) data time 0.0008 (0.0019) model time 0.5639 (0.5836) loss 7.1216 (7.5693) grad_norm 2.1223 (2.1781) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:54:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][410/625] eta 0:02:06 lr 0.000823 wd 0.0500 time 0.5653 (0.5880) data time 0.0007 (0.0018) model time 0.5646 (0.5833) loss 9.0239 (7.5810) grad_norm 2.4179 (2.1912) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:55:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][420/625] eta 0:02:00 lr 0.000823 wd 0.0500 time 0.5658 (0.5877) data time 0.0006 (0.0018) model time 0.5653 (0.5830) loss 8.4979 (7.5844) grad_norm 2.5221 (2.2002) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:55:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][430/625] eta 0:01:54 lr 0.000823 wd 0.0500 time 0.5751 (0.5874) data time 0.0008 (0.0018) model time 0.5743 (0.5828) loss 6.9784 (7.5798) grad_norm 1.9834 (2.2067) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:55:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][440/625] eta 0:01:48 lr 0.000823 wd 0.0500 time 0.5722 (0.5871) data time 0.0008 (0.0018) model time 0.5714 (0.5826) loss 5.4848 (7.5763) grad_norm 1.4169 (2.2083) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:55:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][450/625] eta 0:01:42 lr 0.000823 wd 0.0500 time 0.5722 (0.5869) data time 0.0006 (0.0017) model time 0.5716 (0.5824) loss 8.0497 (7.5784) grad_norm 1.6897 (2.2107) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:55:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][460/625] eta 0:01:36 lr 0.000823 wd 0.0500 time 0.5665 (0.5867) data time 0.0006 (0.0017) model time 0.5659 (0.5823) loss 8.5106 (7.5863) grad_norm 3.5185 (2.2146) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:55:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][470/625] eta 0:01:31 lr 0.000822 wd 0.0500 time 0.5697 (0.5874) data time 0.0008 (0.0017) model time 0.5689 (0.5832) loss 6.4835 (7.5835) grad_norm 1.7369 (2.2111) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:55:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][480/625] eta 0:01:25 lr 0.000822 wd 0.0500 time 0.5655 (0.5880) data time 0.0006 (0.0017) model time 0.5649 (0.5840) loss 7.1279 (7.5822) grad_norm 2.2880 (2.2059) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:55:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][490/625] eta 0:01:19 lr 0.000822 wd 0.0500 time 0.7409 (0.5895) data time 0.0008 (0.0017) model time 0.7401 (0.5857) loss 8.8015 (7.5894) grad_norm 2.9318 (2.2064) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:55:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][500/625] eta 0:01:13 lr 0.000822 wd 0.0500 time 0.5729 (0.5898) data time 0.0006 (0.0017) model time 0.5723 (0.5861) loss 5.6836 (7.5877) grad_norm 2.3907 (2.2067) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:55:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][510/625] eta 0:01:07 lr 0.000822 wd 0.0500 time 0.7663 (0.5899) data time 0.0006 (0.0016) model time 0.7658 (0.5862) loss 9.5053 (7.6014) grad_norm 1.7461 (2.2053) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:56:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][520/625] eta 0:01:01 lr 0.000822 wd 0.0500 time 0.5656 (0.5895) data time 0.0006 (0.0016) model time 0.5650 (0.5858) loss 6.8296 (7.6049) grad_norm 2.7898 (2.2089) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:56:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][530/625] eta 0:00:55 lr 0.000822 wd 0.0500 time 0.5733 (0.5892) data time 0.0008 (0.0016) model time 0.5724 (0.5856) loss 6.6131 (7.5984) grad_norm 2.0559 (2.2106) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:56:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][540/625] eta 0:00:50 lr 0.000822 wd 0.0500 time 0.5763 (0.5891) data time 0.0006 (0.0016) model time 0.5757 (0.5856) loss 5.9308 (7.5963) grad_norm 2.2330 (2.2058) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:56:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][550/625] eta 0:00:44 lr 0.000822 wd 0.0500 time 0.5628 (0.5889) data time 0.0007 (0.0016) model time 0.5621 (0.5853) loss 8.7200 (7.5896) grad_norm 2.0603 (2.2048) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:56:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][560/625] eta 0:00:38 lr 0.000822 wd 0.0500 time 0.5718 (0.5886) data time 0.0010 (0.0016) model time 0.5708 (0.5851) loss 8.9599 (7.5833) grad_norm 1.8569 (2.2027) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:56:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][570/625] eta 0:00:32 lr 0.000821 wd 0.0500 time 0.5635 (0.5884) data time 0.0008 (0.0016) model time 0.5627 (0.5850) loss 7.9569 (7.5888) grad_norm 2.3663 (2.2065) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:56:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][580/625] eta 0:00:26 lr 0.000821 wd 0.0500 time 0.5740 (0.5883) data time 0.0006 (0.0015) model time 0.5734 (0.5848) loss 7.2577 (7.5825) grad_norm 1.8371 (2.2083) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:56:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][590/625] eta 0:00:20 lr 0.000821 wd 0.0500 time 0.5709 (0.5880) data time 0.0008 (0.0015) model time 0.5701 (0.5846) loss 7.1299 (7.5858) grad_norm 1.9617 (2.2081) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:56:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][600/625] eta 0:00:14 lr 0.000821 wd 0.0500 time 0.5680 (0.5879) data time 0.0010 (0.0015) model time 0.5670 (0.5845) loss 8.5237 (7.5884) grad_norm 1.7794 (2.2082) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:56:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][610/625] eta 0:00:08 lr 0.000821 wd 0.0500 time 0.5710 (0.5877) data time 0.0006 (0.0015) model time 0.5705 (0.5843) loss 6.0800 (7.5869) grad_norm 2.1430 (2.2116) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:57:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [126/300][620/625] eta 0:00:02 lr 0.000821 wd 0.0500 time 0.5739 (0.5875) data time 0.0006 (0.0015) model time 0.5734 (0.5842) loss 7.1102 (7.5852) grad_norm 2.6677 (2.2175) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:57:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 126 training takes 0:06:07 +[2024-07-25 02:57:05 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 02:57:06 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 02:57:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.468 (0.468) Loss 0.5322 (0.5322) Acc@1 88.477 (88.477) Acc@5 98.633 (98.633) Mem 22339MB +[2024-07-25 02:57:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.157) Loss 0.8340 (0.6723) Acc@1 81.201 (85.618) Acc@5 96.484 (97.581) Mem 22339MB +[2024-07-25 02:57:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9937 (0.7925) Acc@1 76.514 (82.227) Acc@5 94.385 (96.177) Mem 22339MB +[2024-07-25 02:57:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.948 Acc@5 96.157 +[2024-07-25 02:57:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.9% +[2024-07-25 02:57:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 1.102 (1.102) Loss 0.5020 (0.5020) Acc@1 89.160 (89.160) Acc@5 98.633 (98.633) Mem 22339MB +[2024-07-25 02:57:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.214) Loss 0.8008 (0.6327) Acc@1 81.055 (86.297) Acc@5 96.289 (97.705) Mem 22339MB +[2024-07-25 02:57:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.172) Loss 0.9185 (0.7425) Acc@1 77.344 (83.005) Acc@5 95.459 (96.591) Mem 22339MB +[2024-07-25 02:57:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.672 Acc@5 96.597 +[2024-07-25 02:57:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.7% +[2024-07-25 02:57:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.67% +[2024-07-25 02:57:14 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 02:57:15 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 02:57:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][0/625] eta 0:09:46 lr 0.000821 wd 0.0500 time 0.9381 (0.9381) data time 0.4190 (0.4190) model time 0.0000 (0.0000) loss 8.6591 (8.6591) grad_norm 1.8511 (1.8511) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:57:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][10/625] eta 0:06:15 lr 0.000821 wd 0.0500 time 0.5695 (0.6098) data time 0.0006 (0.0388) model time 0.0000 (0.0000) loss 8.0019 (7.7551) grad_norm 1.9162 (2.2011) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:57:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][20/625] eta 0:05:59 lr 0.000821 wd 0.0500 time 0.5724 (0.5949) data time 0.0008 (0.0207) model time 0.0000 (0.0000) loss 7.7627 (7.6475) grad_norm 1.5139 (2.1315) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:57:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][30/625] eta 0:05:51 lr 0.000821 wd 0.0500 time 0.5681 (0.5903) data time 0.0008 (0.0143) model time 0.0000 (0.0000) loss 8.5507 (7.6762) grad_norm 2.4667 (2.1343) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:57:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][40/625] eta 0:05:44 lr 0.000821 wd 0.0500 time 0.5609 (0.5887) data time 0.0006 (0.0110) model time 0.0000 (0.0000) loss 6.6213 (7.6116) grad_norm 2.4869 (2.1537) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:57:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][50/625] eta 0:05:38 lr 0.000820 wd 0.0500 time 0.5611 (0.5881) data time 0.0008 (0.0090) model time 0.0000 (0.0000) loss 7.4866 (7.5897) grad_norm 2.3147 (2.1717) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:57:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][60/625] eta 0:05:31 lr 0.000820 wd 0.0500 time 0.5720 (0.5866) data time 0.0008 (0.0077) model time 0.5712 (0.5778) loss 7.8687 (7.5336) grad_norm 2.0508 (2.1609) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:57:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][70/625] eta 0:05:28 lr 0.000820 wd 0.0500 time 0.7779 (0.5916) data time 0.0008 (0.0067) model time 0.7770 (0.5997) loss 5.9642 (7.5563) grad_norm 3.4874 (2.1945) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:58:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][80/625] eta 0:05:23 lr 0.000820 wd 0.0500 time 0.7404 (0.5932) data time 0.0008 (0.0060) model time 0.7396 (0.6010) loss 6.8411 (7.5310) grad_norm 3.4763 (2.3050) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:58:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][90/625] eta 0:05:19 lr 0.000820 wd 0.0500 time 0.7391 (0.5981) data time 0.0007 (0.0054) model time 0.7385 (0.6100) loss 8.4976 (7.5501) grad_norm 3.2865 (2.3423) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:58:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][100/625] eta 0:05:13 lr 0.000820 wd 0.0500 time 0.5715 (0.5972) data time 0.0008 (0.0050) model time 0.5707 (0.6057) loss 9.5816 (7.5785) grad_norm 1.8853 (2.3199) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:58:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][110/625] eta 0:05:06 lr 0.000820 wd 0.0500 time 0.5719 (0.5953) data time 0.0008 (0.0046) model time 0.5711 (0.6005) loss 7.8711 (7.5863) grad_norm 1.8215 (2.3010) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:58:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][120/625] eta 0:04:59 lr 0.000820 wd 0.0500 time 0.5712 (0.5936) data time 0.0006 (0.0043) model time 0.5706 (0.5968) loss 7.1491 (7.5622) grad_norm 1.7153 (2.2577) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:58:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][130/625] eta 0:04:53 lr 0.000820 wd 0.0500 time 0.5727 (0.5925) data time 0.0008 (0.0040) model time 0.5719 (0.5944) loss 7.4929 (7.5643) grad_norm 2.1547 (2.2400) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:58:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][140/625] eta 0:04:46 lr 0.000820 wd 0.0500 time 0.5724 (0.5916) data time 0.0008 (0.0038) model time 0.5716 (0.5927) loss 7.9834 (7.5741) grad_norm 2.0419 (2.2224) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:58:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][150/625] eta 0:04:40 lr 0.000819 wd 0.0500 time 0.5701 (0.5911) data time 0.0006 (0.0036) model time 0.5695 (0.5917) loss 7.7620 (7.5746) grad_norm 1.7461 (2.2039) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:58:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][160/625] eta 0:04:34 lr 0.000819 wd 0.0500 time 0.5617 (0.5901) data time 0.0008 (0.0034) model time 0.5609 (0.5901) loss 5.6085 (7.5511) grad_norm 1.6968 (2.1961) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:58:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][170/625] eta 0:04:28 lr 0.000819 wd 0.0500 time 0.5715 (0.5892) data time 0.0008 (0.0033) model time 0.5707 (0.5889) loss 8.4445 (7.5534) grad_norm 2.3642 (2.1819) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:59:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][180/625] eta 0:04:21 lr 0.000819 wd 0.0500 time 0.5736 (0.5885) data time 0.0006 (0.0031) model time 0.5730 (0.5878) loss 8.5842 (7.5499) grad_norm 1.9680 (2.1751) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:59:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][190/625] eta 0:04:15 lr 0.000819 wd 0.0500 time 0.5686 (0.5879) data time 0.0006 (0.0030) model time 0.5680 (0.5870) loss 7.4143 (7.5747) grad_norm 1.6903 (2.1587) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:59:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][200/625] eta 0:04:09 lr 0.000819 wd 0.0500 time 0.5713 (0.5873) data time 0.0006 (0.0029) model time 0.5707 (0.5862) loss 6.7727 (7.5686) grad_norm 1.8591 (2.1625) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:59:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][210/625] eta 0:04:03 lr 0.000819 wd 0.0500 time 0.5707 (0.5868) data time 0.0008 (0.0028) model time 0.5699 (0.5856) loss 8.8926 (7.5811) grad_norm 1.8695 (2.1474) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:59:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][220/625] eta 0:03:57 lr 0.000819 wd 0.0500 time 0.5708 (0.5863) data time 0.0006 (0.0027) model time 0.5703 (0.5849) loss 8.9641 (7.5787) grad_norm 3.3846 (2.1546) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:59:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][230/625] eta 0:03:51 lr 0.000819 wd 0.0500 time 0.5683 (0.5860) data time 0.0006 (0.0026) model time 0.5677 (0.5845) loss 8.2763 (7.5898) grad_norm 2.1747 (2.1830) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:59:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][240/625] eta 0:03:45 lr 0.000819 wd 0.0500 time 0.5643 (0.5856) data time 0.0006 (0.0026) model time 0.5636 (0.5841) loss 7.7334 (7.5873) grad_norm 3.5774 (2.2091) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:59:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][250/625] eta 0:03:39 lr 0.000818 wd 0.0500 time 0.5735 (0.5852) data time 0.0007 (0.0025) model time 0.5729 (0.5837) loss 6.3469 (7.5798) grad_norm 2.4541 (2.2292) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:59:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][260/625] eta 0:03:33 lr 0.000818 wd 0.0500 time 0.5693 (0.5849) data time 0.0006 (0.0024) model time 0.5687 (0.5833) loss 8.8246 (7.5739) grad_norm 2.1129 (2.2313) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 02:59:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][270/625] eta 0:03:27 lr 0.000818 wd 0.0500 time 0.5723 (0.5851) data time 0.0006 (0.0024) model time 0.5716 (0.5836) loss 6.3849 (7.5592) grad_norm 1.6337 (2.2237) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:00:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][280/625] eta 0:03:21 lr 0.000818 wd 0.0500 time 0.6986 (0.5853) data time 0.0008 (0.0023) model time 0.6977 (0.5839) loss 9.7796 (7.5628) grad_norm 1.7776 (2.2409) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:00:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][290/625] eta 0:03:16 lr 0.000818 wd 0.0500 time 0.5684 (0.5859) data time 0.0006 (0.0023) model time 0.5678 (0.5846) loss 8.2425 (7.5524) grad_norm 1.7437 (2.2348) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:00:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][300/625] eta 0:03:10 lr 0.000818 wd 0.0500 time 0.7172 (0.5874) data time 0.0007 (0.0022) model time 0.7165 (0.5865) loss 6.8488 (7.5369) grad_norm 1.7122 (2.2243) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:00:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][310/625] eta 0:03:05 lr 0.000818 wd 0.0500 time 0.7193 (0.5885) data time 0.0006 (0.0022) model time 0.7187 (0.5877) loss 7.1047 (7.5484) grad_norm 1.6937 (2.2186) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:00:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][320/625] eta 0:02:59 lr 0.000818 wd 0.0500 time 0.5740 (0.5886) data time 0.0008 (0.0021) model time 0.5732 (0.5879) loss 7.4310 (7.5450) grad_norm 1.6012 (2.2110) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:00:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][330/625] eta 0:02:53 lr 0.000818 wd 0.0500 time 0.5732 (0.5882) data time 0.0006 (0.0021) model time 0.5726 (0.5874) loss 6.5641 (7.5559) grad_norm 1.6862 (2.2066) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:00:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][340/625] eta 0:02:47 lr 0.000818 wd 0.0500 time 0.5710 (0.5879) data time 0.0007 (0.0020) model time 0.5703 (0.5870) loss 6.5742 (7.5598) grad_norm 1.7062 (2.2006) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:00:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][350/625] eta 0:02:41 lr 0.000817 wd 0.0500 time 0.5639 (0.5875) data time 0.0006 (0.0020) model time 0.5633 (0.5866) loss 7.4901 (7.5681) grad_norm 2.1375 (2.1949) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:00:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][360/625] eta 0:02:35 lr 0.000817 wd 0.0500 time 0.5678 (0.5871) data time 0.0010 (0.0020) model time 0.5668 (0.5861) loss 5.9146 (7.5664) grad_norm 2.7241 (2.1913) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:00:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][370/625] eta 0:02:29 lr 0.000817 wd 0.0500 time 0.5740 (0.5867) data time 0.0006 (0.0019) model time 0.5733 (0.5857) loss 9.0815 (7.5584) grad_norm 1.9572 (2.1873) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:00:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][380/625] eta 0:02:23 lr 0.000817 wd 0.0500 time 0.5731 (0.5864) data time 0.0006 (0.0019) model time 0.5725 (0.5854) loss 7.1677 (7.5523) grad_norm 2.2424 (2.1807) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:01:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][390/625] eta 0:02:17 lr 0.000817 wd 0.0500 time 0.5683 (0.5863) data time 0.0008 (0.0019) model time 0.5675 (0.5852) loss 6.7361 (7.5552) grad_norm 2.9534 (2.1810) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:01:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][400/625] eta 0:02:11 lr 0.000817 wd 0.0500 time 0.5746 (0.5860) data time 0.0007 (0.0019) model time 0.5739 (0.5849) loss 6.6700 (7.5495) grad_norm 3.1208 (2.1998) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:01:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][410/625] eta 0:02:05 lr 0.000817 wd 0.0500 time 0.5683 (0.5858) data time 0.0008 (0.0018) model time 0.5675 (0.5847) loss 6.1797 (7.5533) grad_norm 1.6301 (2.1946) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:01:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][420/625] eta 0:02:00 lr 0.000817 wd 0.0500 time 0.5731 (0.5855) data time 0.0008 (0.0018) model time 0.5724 (0.5844) loss 6.8466 (7.5517) grad_norm 1.7060 (2.1951) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:01:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][430/625] eta 0:01:54 lr 0.000817 wd 0.0500 time 0.5722 (0.5853) data time 0.0006 (0.0018) model time 0.5716 (0.5841) loss 8.6559 (7.5597) grad_norm 2.8948 (2.2009) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:01:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][440/625] eta 0:01:48 lr 0.000817 wd 0.0500 time 0.5724 (0.5851) data time 0.0008 (0.0018) model time 0.5717 (0.5839) loss 8.2778 (7.5659) grad_norm 1.4317 (2.1993) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:01:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][450/625] eta 0:01:42 lr 0.000816 wd 0.0500 time 0.5743 (0.5849) data time 0.0006 (0.0017) model time 0.5737 (0.5837) loss 7.0641 (7.5649) grad_norm 1.4784 (2.1943) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:01:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][460/625] eta 0:01:36 lr 0.000816 wd 0.0500 time 0.5726 (0.5847) data time 0.0006 (0.0017) model time 0.5720 (0.5835) loss 6.5502 (7.5670) grad_norm 1.7879 (2.1901) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:01:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][470/625] eta 0:01:30 lr 0.000816 wd 0.0500 time 0.5716 (0.5845) data time 0.0006 (0.0017) model time 0.5710 (0.5833) loss 7.3097 (7.5760) grad_norm 2.1266 (2.1980) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:01:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][480/625] eta 0:01:24 lr 0.000816 wd 0.0500 time 0.5721 (0.5844) data time 0.0008 (0.0017) model time 0.5713 (0.5831) loss 6.1269 (7.5803) grad_norm 1.7055 (2.2018) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:02:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][490/625] eta 0:01:18 lr 0.000816 wd 0.0500 time 0.5732 (0.5843) data time 0.0006 (0.0017) model time 0.5726 (0.5830) loss 6.1791 (7.5879) grad_norm 2.1748 (2.1970) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:02:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][500/625] eta 0:01:13 lr 0.000816 wd 0.0500 time 0.5628 (0.5841) data time 0.0007 (0.0016) model time 0.5621 (0.5829) loss 6.4873 (7.5896) grad_norm 1.6683 (2.1995) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:02:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][510/625] eta 0:01:07 lr 0.000816 wd 0.0500 time 0.7170 (0.5849) data time 0.0008 (0.0016) model time 0.7162 (0.5838) loss 6.9104 (7.5861) grad_norm 1.7067 (2.1986) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:02:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][520/625] eta 0:01:01 lr 0.000816 wd 0.0500 time 0.7748 (0.5857) data time 0.0008 (0.0016) model time 0.7740 (0.5847) loss 7.2892 (7.5860) grad_norm 1.9246 (2.1987) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:02:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][530/625] eta 0:00:55 lr 0.000816 wd 0.0500 time 0.5623 (0.5867) data time 0.0006 (0.0016) model time 0.5617 (0.5857) loss 5.8339 (7.5838) grad_norm 2.0050 (2.1997) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:02:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][540/625] eta 0:00:49 lr 0.000816 wd 0.0500 time 0.5725 (0.5869) data time 0.0008 (0.0016) model time 0.5717 (0.5860) loss 8.8973 (7.5919) grad_norm 3.0413 (2.2120) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:02:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][550/625] eta 0:00:44 lr 0.000815 wd 0.0500 time 0.5705 (0.5867) data time 0.0006 (0.0016) model time 0.5699 (0.5857) loss 8.1012 (7.5957) grad_norm 1.6301 (2.2141) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:02:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][560/625] eta 0:00:38 lr 0.000815 wd 0.0500 time 0.5742 (0.5865) data time 0.0006 (0.0015) model time 0.5736 (0.5856) loss 8.8062 (7.5901) grad_norm 3.1743 (2.2196) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:02:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][570/625] eta 0:00:32 lr 0.000815 wd 0.0500 time 0.5753 (0.5863) data time 0.0008 (0.0015) model time 0.5745 (0.5854) loss 6.2803 (7.5897) grad_norm 1.8579 (2.2185) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:02:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][580/625] eta 0:00:26 lr 0.000815 wd 0.0500 time 0.5715 (0.5861) data time 0.0008 (0.0015) model time 0.5708 (0.5851) loss 7.5608 (7.5869) grad_norm 1.8742 (2.2187) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:03:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][590/625] eta 0:00:20 lr 0.000815 wd 0.0500 time 0.5721 (0.5859) data time 0.0006 (0.0015) model time 0.5714 (0.5849) loss 8.2870 (7.5927) grad_norm 3.1430 (2.2177) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:03:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][600/625] eta 0:00:14 lr 0.000815 wd 0.0500 time 0.5737 (0.5857) data time 0.0006 (0.0015) model time 0.5731 (0.5847) loss 7.5749 (7.5982) grad_norm 1.7313 (2.2137) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:03:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][610/625] eta 0:00:08 lr 0.000815 wd 0.0500 time 0.5649 (0.5856) data time 0.0006 (0.0015) model time 0.5643 (0.5845) loss 8.4508 (7.5941) grad_norm 2.4531 (2.2131) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:03:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [127/300][620/625] eta 0:00:02 lr 0.000815 wd 0.0500 time 0.5718 (0.5854) data time 0.0006 (0.0015) model time 0.5712 (0.5843) loss 8.2678 (7.5917) grad_norm 2.4198 (2.2134) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:03:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 127 training takes 0:06:05 +[2024-07-25 03:03:21 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 03:03:23 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 03:03:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.481 (0.481) Loss 0.5405 (0.5405) Acc@1 88.477 (88.477) Acc@5 98.730 (98.730) Mem 22339MB +[2024-07-25 03:03:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.8472 (0.6789) Acc@1 80.225 (85.365) Acc@5 96.191 (97.523) Mem 22339MB +[2024-07-25 03:03:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.143) Loss 0.9946 (0.7989) Acc@1 75.684 (82.034) Acc@5 94.385 (96.222) Mem 22339MB +[2024-07-25 03:03:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.786 Acc@5 96.207 +[2024-07-25 03:03:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 81.8% +[2024-07-25 03:03:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.942 (0.942) Loss 0.5015 (0.5015) Acc@1 89.111 (89.111) Acc@5 98.535 (98.535) Mem 22339MB +[2024-07-25 03:03:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.205) Loss 0.7998 (0.6321) Acc@1 80.957 (86.319) Acc@5 96.240 (97.692) Mem 22339MB +[2024-07-25 03:03:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.167) Loss 0.9170 (0.7416) Acc@1 77.295 (83.017) Acc@5 95.410 (96.582) Mem 22339MB +[2024-07-25 03:03:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.688 Acc@5 96.587 +[2024-07-25 03:03:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.7% +[2024-07-25 03:03:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.69% +[2024-07-25 03:03:30 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 03:03:31 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 03:03:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][0/625] eta 0:09:40 lr 0.000815 wd 0.0500 time 0.9289 (0.9289) data time 0.4106 (0.4106) model time 0.0000 (0.0000) loss 7.4749 (7.4749) grad_norm 2.5432 (2.5432) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:03:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][10/625] eta 0:06:12 lr 0.000815 wd 0.0500 time 0.5714 (0.6061) data time 0.0006 (0.0381) model time 0.0000 (0.0000) loss 7.6074 (7.7357) grad_norm 2.1892 (2.8571) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:03:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][20/625] eta 0:05:59 lr 0.000815 wd 0.0500 time 0.5741 (0.5948) data time 0.0006 (0.0203) model time 0.0000 (0.0000) loss 8.9896 (7.8116) grad_norm 1.7444 (2.8766) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:03:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][30/625] eta 0:05:50 lr 0.000814 wd 0.0500 time 0.5668 (0.5882) data time 0.0006 (0.0140) model time 0.0000 (0.0000) loss 6.6351 (7.5394) grad_norm 3.6764 (2.6789) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:03:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][40/625] eta 0:05:42 lr 0.000814 wd 0.0500 time 0.5708 (0.5848) data time 0.0006 (0.0108) model time 0.0000 (0.0000) loss 6.0110 (7.4292) grad_norm 2.2099 (2.7350) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:04:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][50/625] eta 0:05:35 lr 0.000814 wd 0.0500 time 0.5722 (0.5829) data time 0.0008 (0.0088) model time 0.0000 (0.0000) loss 8.7251 (7.4471) grad_norm 1.7333 (2.6093) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:04:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][60/625] eta 0:05:28 lr 0.000814 wd 0.0500 time 0.5668 (0.5816) data time 0.0005 (0.0075) model time 0.5663 (0.5737) loss 8.7278 (7.4509) grad_norm 1.6711 (2.5417) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:04:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][70/625] eta 0:05:22 lr 0.000814 wd 0.0500 time 0.5723 (0.5805) data time 0.0006 (0.0066) model time 0.5717 (0.5733) loss 7.0703 (7.4418) grad_norm 3.0533 (2.5052) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:04:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][80/625] eta 0:05:15 lr 0.000814 wd 0.0500 time 0.5718 (0.5798) data time 0.0006 (0.0059) model time 0.5712 (0.5737) loss 8.7423 (7.5015) grad_norm 1.4841 (2.4506) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:04:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][90/625] eta 0:05:09 lr 0.000814 wd 0.0500 time 0.5602 (0.5793) data time 0.0008 (0.0053) model time 0.5594 (0.5738) loss 7.1955 (7.5155) grad_norm 1.5944 (2.3843) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:04:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][100/625] eta 0:05:05 lr 0.000814 wd 0.0500 time 0.7308 (0.5824) data time 0.0008 (0.0049) model time 0.7300 (0.5809) loss 7.6497 (7.5124) grad_norm 1.8730 (2.3451) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:04:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][110/625] eta 0:05:02 lr 0.000814 wd 0.0500 time 0.5748 (0.5866) data time 0.0008 (0.0045) model time 0.5740 (0.5888) loss 7.1157 (7.5152) grad_norm 2.5998 (2.3188) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:04:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][120/625] eta 0:04:58 lr 0.000814 wd 0.0500 time 0.5706 (0.5902) data time 0.0006 (0.0042) model time 0.5699 (0.5946) loss 8.1886 (7.5586) grad_norm 2.1632 (2.3028) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:04:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][130/625] eta 0:04:53 lr 0.000813 wd 0.0500 time 0.5693 (0.5933) data time 0.0006 (0.0039) model time 0.5687 (0.5990) loss 7.0006 (7.5332) grad_norm 3.4421 (2.3160) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:04:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][140/625] eta 0:04:47 lr 0.000813 wd 0.0500 time 0.5811 (0.5930) data time 0.0008 (0.0037) model time 0.5802 (0.5979) loss 9.0501 (7.5588) grad_norm 2.9012 (2.3379) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:05:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][150/625] eta 0:04:41 lr 0.000813 wd 0.0500 time 0.5671 (0.5919) data time 0.0008 (0.0035) model time 0.5662 (0.5957) loss 7.2856 (7.5554) grad_norm 1.9548 (2.3143) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:05:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][160/625] eta 0:04:34 lr 0.000813 wd 0.0500 time 0.5712 (0.5910) data time 0.0007 (0.0034) model time 0.5704 (0.5939) loss 6.0897 (7.5511) grad_norm 2.3170 (2.2926) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:05:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][170/625] eta 0:04:28 lr 0.000813 wd 0.0500 time 0.5754 (0.5901) data time 0.0008 (0.0032) model time 0.5746 (0.5923) loss 7.1113 (7.5566) grad_norm 1.9847 (2.2772) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:05:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][180/625] eta 0:04:22 lr 0.000813 wd 0.0500 time 0.5662 (0.5895) data time 0.0006 (0.0031) model time 0.5656 (0.5912) loss 8.8565 (7.5649) grad_norm 2.5832 (2.2785) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:05:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][190/625] eta 0:04:16 lr 0.000813 wd 0.0500 time 0.5599 (0.5889) data time 0.0006 (0.0031) model time 0.5593 (0.5901) loss 7.3588 (7.5642) grad_norm 2.3787 (2.2786) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:05:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][200/625] eta 0:04:09 lr 0.000813 wd 0.0500 time 0.5669 (0.5882) data time 0.0006 (0.0030) model time 0.5662 (0.5890) loss 7.7826 (7.5764) grad_norm 1.6781 (2.2709) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:05:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][210/625] eta 0:04:03 lr 0.000813 wd 0.0500 time 0.5646 (0.5878) data time 0.0006 (0.0029) model time 0.5640 (0.5884) loss 8.3957 (7.5775) grad_norm 1.7907 (2.2624) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:05:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][220/625] eta 0:03:57 lr 0.000813 wd 0.0500 time 0.5628 (0.5874) data time 0.0006 (0.0028) model time 0.5622 (0.5877) loss 7.6126 (7.5647) grad_norm 2.0926 (2.2651) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:05:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][230/625] eta 0:03:51 lr 0.000812 wd 0.0500 time 0.5645 (0.5873) data time 0.0006 (0.0027) model time 0.5639 (0.5875) loss 8.1013 (7.5672) grad_norm 2.1134 (2.2978) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:05:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][240/625] eta 0:03:46 lr 0.000812 wd 0.0500 time 0.5745 (0.5877) data time 0.0007 (0.0027) model time 0.5737 (0.5879) loss 6.0728 (7.5636) grad_norm 2.0363 (2.3117) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:05:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][250/625] eta 0:03:40 lr 0.000812 wd 0.0500 time 0.5611 (0.5873) data time 0.0006 (0.0026) model time 0.5605 (0.5874) loss 8.8676 (7.5612) grad_norm 2.8810 (2.3147) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:06:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][260/625] eta 0:03:34 lr 0.000812 wd 0.0500 time 0.5709 (0.5868) data time 0.0008 (0.0025) model time 0.5701 (0.5867) loss 9.3771 (7.5864) grad_norm 1.9135 (2.3030) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:06:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][270/625] eta 0:03:28 lr 0.000812 wd 0.0500 time 0.5738 (0.5864) data time 0.0006 (0.0025) model time 0.5732 (0.5862) loss 6.3443 (7.5809) grad_norm 2.0938 (2.3216) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:06:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][280/625] eta 0:03:22 lr 0.000812 wd 0.0500 time 0.5630 (0.5862) data time 0.0008 (0.0024) model time 0.5622 (0.5859) loss 7.1140 (7.5772) grad_norm 3.2594 (2.3399) loss_scale 4096.0000 (2091.7295) mem 22339MB +[2024-07-25 03:06:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][290/625] eta 0:03:16 lr 0.000812 wd 0.0500 time 0.5733 (0.5863) data time 0.0006 (0.0024) model time 0.5727 (0.5860) loss 6.9326 (7.5603) grad_norm 1.5630 (2.3361) loss_scale 4096.0000 (2160.6048) mem 22339MB +[2024-07-25 03:06:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][300/625] eta 0:03:10 lr 0.000812 wd 0.0500 time 0.5719 (0.5861) data time 0.0007 (0.0023) model time 0.5713 (0.5857) loss 7.2326 (7.5719) grad_norm 2.0231 (2.3353) loss_scale 4096.0000 (2224.9037) mem 22339MB +[2024-07-25 03:06:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][310/625] eta 0:03:04 lr 0.000812 wd 0.0500 time 0.5683 (0.5858) data time 0.0008 (0.0023) model time 0.5675 (0.5853) loss 6.7452 (7.5762) grad_norm 2.0458 (2.3300) loss_scale 4096.0000 (2285.0675) mem 22339MB +[2024-07-25 03:06:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][320/625] eta 0:02:58 lr 0.000812 wd 0.0500 time 0.6500 (0.5858) data time 0.0008 (0.0022) model time 0.6492 (0.5853) loss 9.8701 (7.5850) grad_norm 1.9274 (2.3161) loss_scale 4096.0000 (2341.4829) mem 22339MB +[2024-07-25 03:06:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][330/625] eta 0:02:53 lr 0.000811 wd 0.0500 time 0.7402 (0.5877) data time 0.0006 (0.0022) model time 0.7396 (0.5876) loss 7.2127 (7.5816) grad_norm 2.0483 (2.3125) loss_scale 4096.0000 (2394.4894) mem 22339MB +[2024-07-25 03:06:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][340/625] eta 0:02:47 lr 0.000811 wd 0.0500 time 0.7071 (0.5887) data time 0.0006 (0.0021) model time 0.7065 (0.5888) loss 7.4436 (7.5837) grad_norm 2.4531 (2.3081) loss_scale 4096.0000 (2444.3871) mem 22339MB +[2024-07-25 03:06:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][350/625] eta 0:02:42 lr 0.000811 wd 0.0500 time 0.6184 (0.5895) data time 0.0006 (0.0021) model time 0.6178 (0.5896) loss 7.0975 (7.5906) grad_norm 2.1576 (2.2981) loss_scale 4096.0000 (2491.4416) mem 22339MB +[2024-07-25 03:07:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][360/625] eta 0:02:36 lr 0.000811 wd 0.0500 time 0.5651 (0.5892) data time 0.0009 (0.0021) model time 0.5642 (0.5893) loss 7.8203 (7.5951) grad_norm 1.6525 (2.2947) loss_scale 4096.0000 (2535.8892) mem 22339MB +[2024-07-25 03:07:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][370/625] eta 0:02:30 lr 0.000811 wd 0.0500 time 0.5636 (0.5888) data time 0.0008 (0.0020) model time 0.5627 (0.5888) loss 8.2382 (7.5918) grad_norm 1.8008 (2.2907) loss_scale 4096.0000 (2577.9407) mem 22339MB +[2024-07-25 03:07:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][380/625] eta 0:02:24 lr 0.000811 wd 0.0500 time 0.5731 (0.5885) data time 0.0010 (0.0020) model time 0.5721 (0.5884) loss 7.5175 (7.5831) grad_norm 1.9093 (2.2803) loss_scale 4096.0000 (2617.7848) mem 22339MB +[2024-07-25 03:07:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][390/625] eta 0:02:18 lr 0.000811 wd 0.0500 time 0.5693 (0.5882) data time 0.0007 (0.0020) model time 0.5686 (0.5880) loss 7.2221 (7.5853) grad_norm 3.1118 (2.2875) loss_scale 4096.0000 (2655.5908) mem 22339MB +[2024-07-25 03:07:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][400/625] eta 0:02:12 lr 0.000811 wd 0.0500 time 0.5665 (0.5879) data time 0.0009 (0.0019) model time 0.5656 (0.5877) loss 6.6577 (7.5912) grad_norm 2.0441 (2.2904) loss_scale 4096.0000 (2691.5112) mem 22339MB +[2024-07-25 03:07:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][410/625] eta 0:02:06 lr 0.000811 wd 0.0500 time 0.5722 (0.5876) data time 0.0007 (0.0019) model time 0.5716 (0.5873) loss 7.4297 (7.6040) grad_norm 2.3528 (2.2862) loss_scale 4096.0000 (2725.6837) mem 22339MB +[2024-07-25 03:07:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][420/625] eta 0:02:00 lr 0.000811 wd 0.0500 time 0.5733 (0.5873) data time 0.0008 (0.0019) model time 0.5725 (0.5870) loss 6.0179 (7.6033) grad_norm 2.0230 (2.2922) loss_scale 4096.0000 (2758.2328) mem 22339MB +[2024-07-25 03:07:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][430/625] eta 0:01:54 lr 0.000810 wd 0.0500 time 0.5693 (0.5871) data time 0.0008 (0.0019) model time 0.5684 (0.5867) loss 8.4263 (7.6055) grad_norm 2.1848 (2.2853) loss_scale 4096.0000 (2789.2715) mem 22339MB +[2024-07-25 03:07:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][440/625] eta 0:01:48 lr 0.000810 wd 0.0500 time 0.5698 (0.5868) data time 0.0006 (0.0018) model time 0.5692 (0.5864) loss 7.5057 (7.6029) grad_norm 1.5871 (2.2787) loss_scale 4096.0000 (2818.9025) mem 22339MB +[2024-07-25 03:07:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][450/625] eta 0:01:42 lr 0.000810 wd 0.0500 time 0.5731 (0.5866) data time 0.0007 (0.0018) model time 0.5724 (0.5862) loss 8.1685 (7.6146) grad_norm 2.2897 (2.2752) loss_scale 4096.0000 (2847.2195) mem 22339MB +[2024-07-25 03:08:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][460/625] eta 0:01:36 lr 0.000810 wd 0.0500 time 0.5688 (0.5868) data time 0.0006 (0.0018) model time 0.5681 (0.5863) loss 7.3421 (7.6109) grad_norm 2.9527 (2.2759) loss_scale 4096.0000 (2874.3080) mem 22339MB +[2024-07-25 03:08:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][470/625] eta 0:01:30 lr 0.000810 wd 0.0500 time 0.5715 (0.5865) data time 0.0008 (0.0018) model time 0.5707 (0.5861) loss 6.6816 (7.6078) grad_norm 2.5599 (2.2777) loss_scale 4096.0000 (2900.2463) mem 22339MB +[2024-07-25 03:08:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][480/625] eta 0:01:25 lr 0.000810 wd 0.0500 time 0.5701 (0.5863) data time 0.0009 (0.0018) model time 0.5692 (0.5858) loss 7.7603 (7.6086) grad_norm 2.1690 (2.2725) loss_scale 4096.0000 (2925.1060) mem 22339MB +[2024-07-25 03:08:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][490/625] eta 0:01:19 lr 0.000810 wd 0.0500 time 0.5738 (0.5861) data time 0.0007 (0.0017) model time 0.5731 (0.5855) loss 6.9929 (7.5977) grad_norm 3.3627 (2.2776) loss_scale 4096.0000 (2948.9532) mem 22339MB +[2024-07-25 03:08:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][500/625] eta 0:01:13 lr 0.000810 wd 0.0500 time 0.5733 (0.5859) data time 0.0008 (0.0017) model time 0.5724 (0.5853) loss 6.2812 (7.5948) grad_norm 1.5856 (2.2776) loss_scale 4096.0000 (2971.8483) mem 22339MB +[2024-07-25 03:08:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][510/625] eta 0:01:07 lr 0.000810 wd 0.0500 time 0.5633 (0.5857) data time 0.0008 (0.0017) model time 0.5624 (0.5851) loss 5.4371 (7.5918) grad_norm 1.9689 (2.2859) loss_scale 4096.0000 (2993.8474) mem 22339MB +[2024-07-25 03:08:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][520/625] eta 0:01:01 lr 0.000810 wd 0.0500 time 0.5740 (0.5856) data time 0.0008 (0.0017) model time 0.5731 (0.5849) loss 7.0339 (7.5981) grad_norm 1.8679 (2.2824) loss_scale 4096.0000 (3015.0019) mem 22339MB +[2024-07-25 03:08:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][530/625] eta 0:00:55 lr 0.000809 wd 0.0500 time 0.5608 (0.5854) data time 0.0008 (0.0017) model time 0.5601 (0.5847) loss 6.6723 (7.5999) grad_norm 1.7292 (2.2737) loss_scale 4096.0000 (3035.3597) mem 22339MB +[2024-07-25 03:08:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][540/625] eta 0:00:49 lr 0.000809 wd 0.0500 time 0.7031 (0.5855) data time 0.0006 (0.0017) model time 0.7025 (0.5849) loss 8.1007 (7.5905) grad_norm 2.9306 (2.2733) loss_scale 4096.0000 (3054.9649) mem 22339MB +[2024-07-25 03:08:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][550/625] eta 0:00:43 lr 0.000809 wd 0.0500 time 0.7506 (0.5863) data time 0.0006 (0.0017) model time 0.7500 (0.5858) loss 9.6807 (7.5911) grad_norm 1.6504 (2.2777) loss_scale 4096.0000 (3073.8584) mem 22339MB +[2024-07-25 03:09:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][560/625] eta 0:00:38 lr 0.000809 wd 0.0500 time 0.7346 (0.5873) data time 0.0008 (0.0016) model time 0.7337 (0.5868) loss 8.3659 (7.5987) grad_norm 1.9485 (2.2747) loss_scale 4096.0000 (3092.0784) mem 22339MB +[2024-07-25 03:09:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][570/625] eta 0:00:32 lr 0.000809 wd 0.0500 time 0.7438 (0.5878) data time 0.0008 (0.0016) model time 0.7429 (0.5873) loss 8.4462 (7.5985) grad_norm 2.0311 (2.2733) loss_scale 4096.0000 (3109.6602) mem 22339MB +[2024-07-25 03:09:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][580/625] eta 0:00:26 lr 0.000809 wd 0.0500 time 0.5732 (0.5876) data time 0.0006 (0.0016) model time 0.5725 (0.5871) loss 8.7460 (7.5996) grad_norm 1.7312 (2.2758) loss_scale 4096.0000 (3126.6368) mem 22339MB +[2024-07-25 03:09:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][590/625] eta 0:00:20 lr 0.000809 wd 0.0500 time 0.5727 (0.5874) data time 0.0007 (0.0016) model time 0.5720 (0.5869) loss 7.3590 (7.6032) grad_norm 2.6625 (2.2797) loss_scale 4096.0000 (3143.0389) mem 22339MB +[2024-07-25 03:09:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][600/625] eta 0:00:14 lr 0.000809 wd 0.0500 time 0.5820 (0.5872) data time 0.0008 (0.0016) model time 0.5811 (0.5867) loss 7.9927 (7.6046) grad_norm 2.1060 (2.2800) loss_scale 4096.0000 (3158.8952) mem 22339MB +[2024-07-25 03:09:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][610/625] eta 0:00:08 lr 0.000809 wd 0.0500 time 0.5724 (0.5870) data time 0.0004 (0.0016) model time 0.5720 (0.5865) loss 8.4781 (7.6012) grad_norm 2.3791 (2.2737) loss_scale 4096.0000 (3174.2324) mem 22339MB +[2024-07-25 03:09:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [128/300][620/625] eta 0:00:02 lr 0.000809 wd 0.0500 time 0.5637 (0.5869) data time 0.0004 (0.0016) model time 0.5633 (0.5863) loss 7.1546 (7.6008) grad_norm 1.8583 (2.2707) loss_scale 4096.0000 (3189.0757) mem 22339MB +[2024-07-25 03:09:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 128 training takes 0:06:06 +[2024-07-25 03:09:38 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 03:09:40 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 03:09:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.523 (0.523) Loss 0.5205 (0.5205) Acc@1 89.062 (89.062) Acc@5 98.682 (98.682) Mem 22339MB +[2024-07-25 03:09:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.162) Loss 0.8672 (0.6653) Acc@1 80.322 (85.760) Acc@5 95.605 (97.496) Mem 22339MB +[2024-07-25 03:09:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.144) Loss 0.9771 (0.7807) Acc@1 75.342 (82.482) Acc@5 94.580 (96.277) Mem 22339MB +[2024-07-25 03:09:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.176 Acc@5 96.251 +[2024-07-25 03:09:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.2% +[2024-07-25 03:09:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 82.18% +[2024-07-25 03:09:43 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 03:09:45 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 03:09:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.462 (0.462) Loss 0.5010 (0.5010) Acc@1 89.062 (89.062) Acc@5 98.535 (98.535) Mem 22339MB +[2024-07-25 03:09:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.156) Loss 0.7983 (0.6316) Acc@1 81.006 (86.301) Acc@5 96.191 (97.701) Mem 22339MB +[2024-07-25 03:09:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.141) Loss 0.9155 (0.7410) Acc@1 77.295 (83.012) Acc@5 95.508 (96.608) Mem 22339MB +[2024-07-25 03:09:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.700 Acc@5 96.607 +[2024-07-25 03:09:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.7% +[2024-07-25 03:09:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.70% +[2024-07-25 03:09:48 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 03:09:50 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 03:09:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][0/625] eta 0:09:14 lr 0.000808 wd 0.0500 time 0.8868 (0.8868) data time 0.3698 (0.3698) model time 0.0000 (0.0000) loss 8.2157 (8.2157) grad_norm 1.5865 (1.5865) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:09:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][10/625] eta 0:06:10 lr 0.000808 wd 0.0500 time 0.5733 (0.6027) data time 0.0006 (0.0343) model time 0.0000 (0.0000) loss 7.0869 (7.1693) grad_norm 2.2269 (2.3223) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:10:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][20/625] eta 0:05:56 lr 0.000808 wd 0.0500 time 0.5697 (0.5888) data time 0.0006 (0.0183) model time 0.0000 (0.0000) loss 7.0815 (7.2694) grad_norm 1.9475 (2.2626) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:10:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][30/625] eta 0:05:47 lr 0.000808 wd 0.0500 time 0.5733 (0.5844) data time 0.0006 (0.0126) model time 0.0000 (0.0000) loss 6.8391 (7.3192) grad_norm 1.9571 (2.2393) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:10:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][40/625] eta 0:05:40 lr 0.000808 wd 0.0500 time 0.5669 (0.5821) data time 0.0008 (0.0097) model time 0.0000 (0.0000) loss 8.4563 (7.4430) grad_norm 1.8238 (2.2204) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:10:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][50/625] eta 0:05:34 lr 0.000808 wd 0.0500 time 0.5752 (0.5812) data time 0.0006 (0.0080) model time 0.0000 (0.0000) loss 7.5971 (7.4019) grad_norm 1.6470 (2.1357) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:10:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][60/625] eta 0:05:27 lr 0.000808 wd 0.0500 time 0.5693 (0.5801) data time 0.0006 (0.0068) model time 0.5687 (0.5735) loss 6.8058 (7.4044) grad_norm 2.2499 (2.1321) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:10:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][70/625] eta 0:05:21 lr 0.000808 wd 0.0500 time 0.5743 (0.5792) data time 0.0008 (0.0060) model time 0.5736 (0.5733) loss 6.1830 (7.3731) grad_norm 1.7085 (2.1453) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:10:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][80/625] eta 0:05:15 lr 0.000808 wd 0.0500 time 0.5625 (0.5788) data time 0.0006 (0.0053) model time 0.5619 (0.5738) loss 7.2704 (7.3320) grad_norm 1.6743 (2.1222) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:10:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][90/625] eta 0:05:09 lr 0.000808 wd 0.0500 time 0.5721 (0.5784) data time 0.0008 (0.0048) model time 0.5712 (0.5739) loss 7.2319 (7.3544) grad_norm 3.3948 (2.1597) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:10:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][100/625] eta 0:05:03 lr 0.000807 wd 0.0500 time 0.5719 (0.5779) data time 0.0008 (0.0044) model time 0.5710 (0.5737) loss 7.4775 (7.3999) grad_norm 1.7218 (2.1424) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:10:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][110/625] eta 0:04:57 lr 0.000807 wd 0.0500 time 0.5735 (0.5777) data time 0.0008 (0.0041) model time 0.5726 (0.5738) loss 5.7792 (7.4317) grad_norm 2.0590 (2.1273) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:11:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][120/625] eta 0:04:51 lr 0.000807 wd 0.0500 time 0.5737 (0.5775) data time 0.0006 (0.0038) model time 0.5730 (0.5741) loss 7.0345 (7.4276) grad_norm 1.8457 (2.1097) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:11:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][130/625] eta 0:04:45 lr 0.000807 wd 0.0500 time 0.5751 (0.5775) data time 0.0008 (0.0036) model time 0.5742 (0.5743) loss 6.6503 (7.4452) grad_norm 2.2844 (2.1124) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:11:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][140/625] eta 0:04:41 lr 0.000807 wd 0.0500 time 0.5686 (0.5808) data time 0.0008 (0.0034) model time 0.5678 (0.5798) loss 9.5921 (7.4632) grad_norm 3.7882 (2.1708) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:11:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][150/625] eta 0:04:37 lr 0.000807 wd 0.0500 time 0.6330 (0.5838) data time 0.0008 (0.0032) model time 0.6321 (0.5842) loss 8.0436 (7.4815) grad_norm 1.8612 (2.1717) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:11:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][160/625] eta 0:04:32 lr 0.000807 wd 0.0500 time 0.5737 (0.5869) data time 0.0008 (0.0031) model time 0.5729 (0.5887) loss 9.3844 (7.5191) grad_norm 3.1186 (2.2082) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:11:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][170/625] eta 0:04:28 lr 0.000807 wd 0.0500 time 0.5709 (0.5896) data time 0.0006 (0.0030) model time 0.5703 (0.5924) loss 7.8200 (7.5499) grad_norm 2.2133 (2.2304) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:11:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][180/625] eta 0:04:22 lr 0.000807 wd 0.0500 time 0.5723 (0.5889) data time 0.0009 (0.0028) model time 0.5714 (0.5910) loss 6.9476 (7.5588) grad_norm 2.5519 (2.2358) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:11:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][190/625] eta 0:04:16 lr 0.000807 wd 0.0500 time 0.5681 (0.5890) data time 0.0008 (0.0027) model time 0.5674 (0.5911) loss 6.3164 (7.5611) grad_norm 2.6027 (2.2367) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:11:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][200/625] eta 0:04:10 lr 0.000806 wd 0.0500 time 0.5630 (0.5884) data time 0.0008 (0.0026) model time 0.5622 (0.5901) loss 8.9463 (7.5820) grad_norm 2.4687 (2.2337) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:11:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][210/625] eta 0:04:03 lr 0.000806 wd 0.0500 time 0.5648 (0.5878) data time 0.0006 (0.0025) model time 0.5642 (0.5891) loss 9.4895 (7.5894) grad_norm 3.2240 (2.2397) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:12:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][220/625] eta 0:03:57 lr 0.000806 wd 0.0500 time 0.5695 (0.5873) data time 0.0008 (0.0025) model time 0.5687 (0.5883) loss 9.0895 (7.5946) grad_norm 2.5886 (2.2709) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:12:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][230/625] eta 0:03:51 lr 0.000806 wd 0.0500 time 0.5745 (0.5869) data time 0.0009 (0.0024) model time 0.5737 (0.5877) loss 7.2379 (7.5801) grad_norm 1.8775 (2.2553) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:12:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][240/625] eta 0:03:45 lr 0.000806 wd 0.0500 time 0.5724 (0.5865) data time 0.0010 (0.0023) model time 0.5715 (0.5870) loss 6.7609 (7.5678) grad_norm 2.5580 (2.2470) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:12:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][250/625] eta 0:03:39 lr 0.000806 wd 0.0500 time 0.5721 (0.5860) data time 0.0006 (0.0023) model time 0.5716 (0.5864) loss 6.8322 (7.5792) grad_norm 2.0186 (2.2316) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:12:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][260/625] eta 0:03:33 lr 0.000806 wd 0.0500 time 0.5716 (0.5856) data time 0.0008 (0.0022) model time 0.5708 (0.5858) loss 8.8971 (7.5849) grad_norm 3.4048 (2.2307) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:12:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][270/625] eta 0:03:27 lr 0.000806 wd 0.0500 time 0.5707 (0.5852) data time 0.0008 (0.0022) model time 0.5699 (0.5853) loss 7.5963 (7.5845) grad_norm 2.4183 (2.2316) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:12:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][280/625] eta 0:03:21 lr 0.000806 wd 0.0500 time 0.5615 (0.5848) data time 0.0007 (0.0021) model time 0.5608 (0.5848) loss 5.6144 (7.5781) grad_norm 1.9132 (2.2393) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:12:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][290/625] eta 0:03:15 lr 0.000806 wd 0.0500 time 0.5710 (0.5845) data time 0.0006 (0.0021) model time 0.5704 (0.5844) loss 7.8710 (7.5945) grad_norm 1.9524 (2.2406) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:12:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][300/625] eta 0:03:09 lr 0.000805 wd 0.0500 time 0.5621 (0.5842) data time 0.0008 (0.0020) model time 0.5613 (0.5840) loss 6.8729 (7.5941) grad_norm 2.4129 (2.2467) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:12:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][310/625] eta 0:03:03 lr 0.000805 wd 0.0500 time 0.5722 (0.5840) data time 0.0007 (0.0020) model time 0.5716 (0.5837) loss 5.7710 (7.5905) grad_norm 1.8757 (2.2523) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:12:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][320/625] eta 0:02:58 lr 0.000805 wd 0.0500 time 0.5745 (0.5837) data time 0.0006 (0.0020) model time 0.5738 (0.5834) loss 7.1826 (7.5769) grad_norm 1.7067 (2.2523) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:13:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][330/625] eta 0:02:52 lr 0.000805 wd 0.0500 time 0.5722 (0.5834) data time 0.0008 (0.0019) model time 0.5714 (0.5830) loss 6.8596 (7.5838) grad_norm 1.9939 (2.2484) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:13:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][340/625] eta 0:02:46 lr 0.000805 wd 0.0500 time 0.5691 (0.5832) data time 0.0008 (0.0019) model time 0.5683 (0.5827) loss 7.8639 (7.5808) grad_norm 2.9579 (2.2498) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:13:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][350/625] eta 0:02:40 lr 0.000805 wd 0.0500 time 0.5652 (0.5830) data time 0.0007 (0.0019) model time 0.5645 (0.5825) loss 7.8324 (7.5891) grad_norm 2.8861 (2.2629) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:13:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][360/625] eta 0:02:34 lr 0.000805 wd 0.0500 time 0.5661 (0.5835) data time 0.0006 (0.0018) model time 0.5655 (0.5830) loss 9.2292 (7.6034) grad_norm 1.8865 (2.2711) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:13:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][370/625] eta 0:02:29 lr 0.000805 wd 0.0500 time 0.6976 (0.5856) data time 0.0009 (0.0018) model time 0.6967 (0.5854) loss 8.6126 (7.6113) grad_norm 1.8848 (2.2648) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:13:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][380/625] eta 0:02:23 lr 0.000805 wd 0.0500 time 0.5993 (0.5869) data time 0.0008 (0.0018) model time 0.5985 (0.5870) loss 7.4857 (7.6183) grad_norm 2.5017 (2.2582) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:13:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][390/625] eta 0:02:18 lr 0.000805 wd 0.0500 time 0.5742 (0.5878) data time 0.0006 (0.0018) model time 0.5736 (0.5880) loss 7.8378 (7.6245) grad_norm 1.7855 (2.2505) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:13:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][400/625] eta 0:02:12 lr 0.000804 wd 0.0500 time 0.5697 (0.5876) data time 0.0009 (0.0018) model time 0.5688 (0.5877) loss 8.2451 (7.6310) grad_norm 1.6485 (2.2506) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:13:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][410/625] eta 0:02:06 lr 0.000804 wd 0.0500 time 0.5689 (0.5876) data time 0.0006 (0.0017) model time 0.5683 (0.5877) loss 7.5806 (7.6208) grad_norm 2.2400 (2.2435) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:13:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][420/625] eta 0:02:00 lr 0.000804 wd 0.0500 time 0.5699 (0.5873) data time 0.0006 (0.0017) model time 0.5694 (0.5873) loss 6.8286 (7.6070) grad_norm 2.0158 (2.2386) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:14:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][430/625] eta 0:01:54 lr 0.000804 wd 0.0500 time 0.5718 (0.5872) data time 0.0006 (0.0017) model time 0.5713 (0.5871) loss 6.4250 (7.6025) grad_norm 2.6609 (2.2421) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:14:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][440/625] eta 0:01:48 lr 0.000804 wd 0.0500 time 0.5748 (0.5870) data time 0.0008 (0.0017) model time 0.5740 (0.5869) loss 6.9366 (7.6037) grad_norm 2.3174 (2.2531) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:14:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][450/625] eta 0:01:42 lr 0.000804 wd 0.0500 time 0.5730 (0.5868) data time 0.0006 (0.0016) model time 0.5725 (0.5867) loss 6.7202 (7.6147) grad_norm 2.5261 (2.2574) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:14:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][460/625] eta 0:01:36 lr 0.000804 wd 0.0500 time 0.5720 (0.5866) data time 0.0007 (0.0016) model time 0.5713 (0.5864) loss 8.2061 (7.6070) grad_norm 2.0577 (2.2586) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:14:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][470/625] eta 0:01:30 lr 0.000804 wd 0.0500 time 0.5725 (0.5865) data time 0.0008 (0.0016) model time 0.5717 (0.5863) loss 9.0940 (7.6094) grad_norm 2.2349 (2.2551) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:14:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][480/625] eta 0:01:25 lr 0.000804 wd 0.0500 time 0.5680 (0.5863) data time 0.0008 (0.0016) model time 0.5672 (0.5861) loss 8.0650 (7.6081) grad_norm 1.6296 (2.2522) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:14:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][490/625] eta 0:01:19 lr 0.000804 wd 0.0500 time 0.5722 (0.5861) data time 0.0006 (0.0016) model time 0.5717 (0.5858) loss 6.6306 (7.6117) grad_norm 1.9814 (2.2499) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:14:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][500/625] eta 0:01:13 lr 0.000803 wd 0.0500 time 0.5700 (0.5859) data time 0.0006 (0.0016) model time 0.5694 (0.5856) loss 7.8210 (7.6135) grad_norm 2.9606 (2.2635) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:14:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][510/625] eta 0:01:07 lr 0.000803 wd 0.0500 time 0.5715 (0.5857) data time 0.0006 (0.0015) model time 0.5709 (0.5853) loss 8.4425 (7.6124) grad_norm 2.0271 (2.2722) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:14:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][520/625] eta 0:01:01 lr 0.000803 wd 0.0500 time 0.5746 (0.5855) data time 0.0006 (0.0015) model time 0.5740 (0.5851) loss 6.1405 (7.6142) grad_norm 2.1882 (2.2729) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:15:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][530/625] eta 0:00:55 lr 0.000803 wd 0.0500 time 0.5742 (0.5853) data time 0.0006 (0.0015) model time 0.5736 (0.5849) loss 6.4580 (7.6142) grad_norm 1.7968 (2.2706) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:15:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][540/625] eta 0:00:49 lr 0.000803 wd 0.0500 time 0.5726 (0.5851) data time 0.0007 (0.0015) model time 0.5718 (0.5847) loss 7.8388 (7.6116) grad_norm 1.9382 (2.2652) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:15:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][550/625] eta 0:00:43 lr 0.000803 wd 0.0500 time 0.5708 (0.5850) data time 0.0008 (0.0015) model time 0.5699 (0.5846) loss 8.6048 (7.6207) grad_norm 1.9791 (2.2638) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 03:15:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][560/625] eta 0:00:38 lr 0.000803 wd 0.0500 time 0.5720 (0.5848) data time 0.0006 (0.0015) model time 0.5714 (0.5843) loss 8.3120 (7.6197) grad_norm 1.9263 (inf) loss_scale 2048.0000 (4059.4938) mem 22339MB +[2024-07-25 03:15:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][570/625] eta 0:00:32 lr 0.000803 wd 0.0500 time 0.5678 (0.5846) data time 0.0006 (0.0015) model time 0.5673 (0.5841) loss 6.2069 (7.6253) grad_norm 1.5427 (inf) loss_scale 2048.0000 (4024.2662) mem 22339MB +[2024-07-25 03:15:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][580/625] eta 0:00:26 lr 0.000803 wd 0.0500 time 0.7040 (0.5853) data time 0.0006 (0.0015) model time 0.7034 (0.5848) loss 8.0672 (7.6290) grad_norm 2.1673 (inf) loss_scale 2048.0000 (3990.2513) mem 22339MB +[2024-07-25 03:15:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][590/625] eta 0:00:20 lr 0.000803 wd 0.0500 time 0.5723 (0.5862) data time 0.0008 (0.0014) model time 0.5715 (0.5859) loss 6.6798 (7.6243) grad_norm 2.5063 (inf) loss_scale 2048.0000 (3957.3875) mem 22339MB +[2024-07-25 03:15:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][600/625] eta 0:00:14 lr 0.000802 wd 0.0500 time 0.6793 (0.5869) data time 0.0006 (0.0014) model time 0.6787 (0.5865) loss 7.6973 (7.6333) grad_norm 2.1190 (inf) loss_scale 2048.0000 (3925.6173) mem 22339MB +[2024-07-25 03:15:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][610/625] eta 0:00:08 lr 0.000802 wd 0.0500 time 0.5708 (0.5870) data time 0.0005 (0.0014) model time 0.5703 (0.5867) loss 9.0398 (7.6332) grad_norm 1.7733 (inf) loss_scale 2048.0000 (3894.8871) mem 22339MB +[2024-07-25 03:15:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [129/300][620/625] eta 0:00:02 lr 0.000802 wd 0.0500 time 0.5714 (0.5868) data time 0.0006 (0.0014) model time 0.5709 (0.5865) loss 8.3902 (7.6314) grad_norm 1.7713 (inf) loss_scale 2048.0000 (3865.1465) mem 22339MB +[2024-07-25 03:15:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 129 training takes 0:06:06 +[2024-07-25 03:15:57 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 03:15:58 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 03:15:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.475 (0.475) Loss 0.5342 (0.5342) Acc@1 89.502 (89.502) Acc@5 98.242 (98.242) Mem 22339MB +[2024-07-25 03:16:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.8691 (0.6784) Acc@1 79.883 (85.498) Acc@5 95.850 (97.430) Mem 22339MB +[2024-07-25 03:16:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9648 (0.7927) Acc@1 76.221 (82.266) Acc@5 94.531 (96.240) Mem 22339MB +[2024-07-25 03:16:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 81.960 Acc@5 96.229 +[2024-07-25 03:16:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.0% +[2024-07-25 03:16:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.799 (0.799) Loss 0.5010 (0.5010) Acc@1 89.062 (89.062) Acc@5 98.535 (98.535) Mem 22339MB +[2024-07-25 03:16:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.188) Loss 0.7969 (0.6313) Acc@1 81.006 (86.328) Acc@5 96.191 (97.710) Mem 22339MB +[2024-07-25 03:16:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.158) Loss 0.9160 (0.7403) Acc@1 77.441 (83.059) Acc@5 95.557 (96.622) Mem 22339MB +[2024-07-25 03:16:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.740 Acc@5 96.619 +[2024-07-25 03:16:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.7% +[2024-07-25 03:16:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.74% +[2024-07-25 03:16:06 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 03:16:07 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 03:16:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][0/625] eta 0:09:21 lr 0.000802 wd 0.0500 time 0.8982 (0.8982) data time 0.3791 (0.3791) model time 0.0000 (0.0000) loss 8.5728 (8.5728) grad_norm 2.4830 (2.4830) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:16:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][10/625] eta 0:06:11 lr 0.000802 wd 0.0500 time 0.5678 (0.6037) data time 0.0006 (0.0352) model time 0.0000 (0.0000) loss 8.2784 (7.9148) grad_norm 4.3076 (2.6704) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:16:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][20/625] eta 0:05:56 lr 0.000802 wd 0.0500 time 0.5697 (0.5892) data time 0.0006 (0.0188) model time 0.0000 (0.0000) loss 6.3759 (7.6736) grad_norm 2.5724 (2.3548) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:16:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][30/625] eta 0:05:47 lr 0.000802 wd 0.0500 time 0.5667 (0.5846) data time 0.0006 (0.0131) model time 0.0000 (0.0000) loss 6.7378 (7.5463) grad_norm 1.8872 (2.2173) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:16:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][40/625] eta 0:05:40 lr 0.000802 wd 0.0500 time 0.5668 (0.5819) data time 0.0006 (0.0101) model time 0.0000 (0.0000) loss 6.0070 (7.5208) grad_norm 3.1662 (2.2251) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:16:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][50/625] eta 0:05:33 lr 0.000802 wd 0.0500 time 0.5749 (0.5807) data time 0.0006 (0.0083) model time 0.0000 (0.0000) loss 7.9068 (7.5335) grad_norm 2.0471 (2.1663) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:16:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][60/625] eta 0:05:27 lr 0.000802 wd 0.0500 time 0.5723 (0.5800) data time 0.0008 (0.0071) model time 0.5715 (0.5755) loss 7.4925 (7.5318) grad_norm 1.7602 (2.1903) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:16:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][70/625] eta 0:05:21 lr 0.000801 wd 0.0500 time 0.5678 (0.5794) data time 0.0006 (0.0062) model time 0.5671 (0.5751) loss 7.0833 (7.5442) grad_norm 2.1759 (2.1876) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:16:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][80/625] eta 0:05:15 lr 0.000801 wd 0.0500 time 0.5720 (0.5787) data time 0.0007 (0.0055) model time 0.5714 (0.5744) loss 7.3422 (7.5725) grad_norm 2.7697 (2.1901) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:17:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][90/625] eta 0:05:09 lr 0.000801 wd 0.0500 time 0.5558 (0.5782) data time 0.0010 (0.0051) model time 0.5548 (0.5739) loss 8.9427 (7.5748) grad_norm 2.7850 (2.1905) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:17:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][100/625] eta 0:05:03 lr 0.000801 wd 0.0500 time 0.5746 (0.5780) data time 0.0008 (0.0047) model time 0.5738 (0.5741) loss 8.6903 (7.6029) grad_norm 1.8233 (2.1869) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:17:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][110/625] eta 0:04:57 lr 0.000801 wd 0.0500 time 0.5744 (0.5780) data time 0.0007 (0.0043) model time 0.5737 (0.5747) loss 6.4416 (7.5891) grad_norm 1.7530 (2.1722) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:17:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][120/625] eta 0:04:51 lr 0.000801 wd 0.0500 time 0.5736 (0.5780) data time 0.0007 (0.0041) model time 0.5729 (0.5750) loss 6.8202 (7.5108) grad_norm 2.1266 (2.1393) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:17:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][130/625] eta 0:04:46 lr 0.000801 wd 0.0500 time 0.5728 (0.5778) data time 0.0006 (0.0038) model time 0.5722 (0.5751) loss 7.6292 (7.4904) grad_norm 1.9090 (2.1406) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:17:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][140/625] eta 0:04:40 lr 0.000801 wd 0.0500 time 0.5732 (0.5777) data time 0.0009 (0.0036) model time 0.5723 (0.5750) loss 9.1900 (7.4825) grad_norm 3.2369 (2.1632) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:17:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][150/625] eta 0:04:34 lr 0.000801 wd 0.0500 time 0.5702 (0.5777) data time 0.0007 (0.0034) model time 0.5695 (0.5752) loss 6.6933 (7.4937) grad_norm 1.6573 (2.1517) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:17:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][160/625] eta 0:04:28 lr 0.000801 wd 0.0500 time 0.5716 (0.5779) data time 0.0008 (0.0032) model time 0.5709 (0.5757) loss 7.7130 (7.5106) grad_norm 1.9018 (2.1332) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:17:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][170/625] eta 0:04:23 lr 0.000800 wd 0.0500 time 0.5628 (0.5786) data time 0.0007 (0.0031) model time 0.5621 (0.5768) loss 6.8142 (7.5310) grad_norm 4.3612 (2.1555) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:17:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][180/625] eta 0:04:19 lr 0.000800 wd 0.0500 time 0.7492 (0.5835) data time 0.0008 (0.0030) model time 0.7484 (0.5836) loss 8.6479 (7.5394) grad_norm 2.1598 (2.1616) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:17:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][190/625] eta 0:04:14 lr 0.000800 wd 0.0500 time 0.5727 (0.5859) data time 0.0006 (0.0029) model time 0.5721 (0.5869) loss 5.9768 (7.4985) grad_norm 2.1304 (2.1542) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:18:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][200/625] eta 0:04:10 lr 0.000800 wd 0.0500 time 0.5717 (0.5891) data time 0.0007 (0.0028) model time 0.5710 (0.5911) loss 7.2511 (7.5199) grad_norm 1.7416 (2.1340) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:18:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][210/625] eta 0:04:04 lr 0.000800 wd 0.0500 time 0.5740 (0.5887) data time 0.0008 (0.0027) model time 0.5732 (0.5903) loss 7.0624 (7.5418) grad_norm 2.8407 (2.1384) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:18:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][220/625] eta 0:03:58 lr 0.000800 wd 0.0500 time 0.5725 (0.5880) data time 0.0008 (0.0026) model time 0.5717 (0.5894) loss 7.0408 (7.5722) grad_norm 4.6949 (2.1721) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:18:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][230/625] eta 0:03:52 lr 0.000800 wd 0.0500 time 0.5743 (0.5875) data time 0.0008 (0.0025) model time 0.5735 (0.5885) loss 7.3446 (7.5847) grad_norm 1.7541 (2.1686) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:18:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][240/625] eta 0:03:45 lr 0.000800 wd 0.0500 time 0.5712 (0.5869) data time 0.0006 (0.0024) model time 0.5706 (0.5877) loss 5.8103 (7.5815) grad_norm 1.9416 (2.1584) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:18:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][250/625] eta 0:03:39 lr 0.000800 wd 0.0500 time 0.5633 (0.5864) data time 0.0006 (0.0024) model time 0.5627 (0.5870) loss 7.1427 (7.5934) grad_norm inf (inf) loss_scale 1024.0000 (2043.9203) mem 22339MB +[2024-07-25 03:18:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][260/625] eta 0:03:33 lr 0.000800 wd 0.0500 time 0.5685 (0.5859) data time 0.0006 (0.0023) model time 0.5679 (0.5863) loss 6.1317 (7.5922) grad_norm 3.4418 (inf) loss_scale 1024.0000 (2004.8429) mem 22339MB +[2024-07-25 03:18:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][270/625] eta 0:03:27 lr 0.000799 wd 0.0500 time 0.5757 (0.5858) data time 0.0006 (0.0023) model time 0.5751 (0.5861) loss 6.3889 (7.5799) grad_norm 1.7599 (inf) loss_scale 1024.0000 (1968.6494) mem 22339MB +[2024-07-25 03:18:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][280/625] eta 0:03:21 lr 0.000799 wd 0.0500 time 0.5722 (0.5854) data time 0.0008 (0.0022) model time 0.5714 (0.5856) loss 8.3139 (7.5783) grad_norm 2.7088 (inf) loss_scale 1024.0000 (1935.0320) mem 22339MB +[2024-07-25 03:18:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][290/625] eta 0:03:16 lr 0.000799 wd 0.0500 time 0.5715 (0.5851) data time 0.0007 (0.0022) model time 0.5709 (0.5852) loss 5.7304 (7.5730) grad_norm 2.4453 (inf) loss_scale 1024.0000 (1903.7251) mem 22339MB +[2024-07-25 03:19:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][300/625] eta 0:03:10 lr 0.000799 wd 0.0500 time 0.5622 (0.5848) data time 0.0008 (0.0021) model time 0.5614 (0.5848) loss 6.2922 (7.5786) grad_norm 1.5254 (inf) loss_scale 1024.0000 (1874.4983) mem 22339MB +[2024-07-25 03:19:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][310/625] eta 0:03:04 lr 0.000799 wd 0.0500 time 0.5679 (0.5845) data time 0.0008 (0.0021) model time 0.5670 (0.5844) loss 6.6691 (7.5785) grad_norm 1.6934 (inf) loss_scale 1024.0000 (1847.1511) mem 22339MB +[2024-07-25 03:19:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][320/625] eta 0:02:58 lr 0.000799 wd 0.0500 time 0.5724 (0.5842) data time 0.0006 (0.0020) model time 0.5718 (0.5840) loss 6.1878 (7.5792) grad_norm 2.0552 (inf) loss_scale 1024.0000 (1821.5078) mem 22339MB +[2024-07-25 03:19:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][330/625] eta 0:02:52 lr 0.000799 wd 0.0500 time 0.5720 (0.5839) data time 0.0008 (0.0020) model time 0.5713 (0.5836) loss 8.5002 (7.5738) grad_norm 1.6232 (inf) loss_scale 1024.0000 (1797.4139) mem 22339MB +[2024-07-25 03:19:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][340/625] eta 0:02:46 lr 0.000799 wd 0.0500 time 0.5628 (0.5837) data time 0.0008 (0.0020) model time 0.5621 (0.5833) loss 8.5616 (7.5960) grad_norm 2.0948 (inf) loss_scale 1024.0000 (1774.7331) mem 22339MB +[2024-07-25 03:19:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][350/625] eta 0:02:40 lr 0.000799 wd 0.0500 time 0.5736 (0.5835) data time 0.0008 (0.0020) model time 0.5728 (0.5830) loss 7.6724 (7.5927) grad_norm 2.9003 (inf) loss_scale 1024.0000 (1753.3447) mem 22339MB +[2024-07-25 03:19:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][360/625] eta 0:02:34 lr 0.000799 wd 0.0500 time 0.5726 (0.5833) data time 0.0008 (0.0019) model time 0.5719 (0.5828) loss 6.0352 (7.5885) grad_norm 2.8681 (inf) loss_scale 1024.0000 (1733.1413) mem 22339MB +[2024-07-25 03:19:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][370/625] eta 0:02:28 lr 0.000798 wd 0.0500 time 0.5675 (0.5831) data time 0.0006 (0.0019) model time 0.5669 (0.5826) loss 7.7276 (7.5925) grad_norm 2.4214 (inf) loss_scale 1024.0000 (1714.0270) mem 22339MB +[2024-07-25 03:19:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][380/625] eta 0:02:22 lr 0.000798 wd 0.0500 time 0.5714 (0.5831) data time 0.0008 (0.0019) model time 0.5706 (0.5826) loss 7.9230 (7.5808) grad_norm 2.5770 (inf) loss_scale 1024.0000 (1695.9160) mem 22339MB +[2024-07-25 03:19:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][390/625] eta 0:02:17 lr 0.000798 wd 0.0500 time 0.5699 (0.5831) data time 0.0008 (0.0018) model time 0.5691 (0.5826) loss 8.2186 (7.5916) grad_norm 1.5605 (inf) loss_scale 1024.0000 (1678.7315) mem 22339MB +[2024-07-25 03:20:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][400/625] eta 0:02:11 lr 0.000798 wd 0.0500 time 0.5664 (0.5848) data time 0.0006 (0.0018) model time 0.5658 (0.5845) loss 6.9054 (7.5799) grad_norm 2.0026 (inf) loss_scale 1024.0000 (1662.4040) mem 22339MB +[2024-07-25 03:20:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][410/625] eta 0:02:05 lr 0.000798 wd 0.0500 time 0.5730 (0.5857) data time 0.0007 (0.0018) model time 0.5723 (0.5856) loss 7.4706 (7.5830) grad_norm 1.9886 (inf) loss_scale 1024.0000 (1646.8710) mem 22339MB +[2024-07-25 03:20:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][420/625] eta 0:02:00 lr 0.000798 wd 0.0500 time 0.5626 (0.5873) data time 0.0006 (0.0018) model time 0.5620 (0.5873) loss 7.6209 (7.5809) grad_norm 2.4655 (inf) loss_scale 1024.0000 (1632.0760) mem 22339MB +[2024-07-25 03:20:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][430/625] eta 0:01:54 lr 0.000798 wd 0.0500 time 0.5613 (0.5873) data time 0.0006 (0.0018) model time 0.5607 (0.5873) loss 9.2140 (7.5831) grad_norm 2.4478 (inf) loss_scale 1024.0000 (1617.9675) mem 22339MB +[2024-07-25 03:20:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][440/625] eta 0:01:48 lr 0.000798 wd 0.0500 time 0.5697 (0.5870) data time 0.0006 (0.0017) model time 0.5691 (0.5869) loss 7.1952 (7.5853) grad_norm 1.9685 (inf) loss_scale 1024.0000 (1604.4989) mem 22339MB +[2024-07-25 03:20:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][450/625] eta 0:01:42 lr 0.000798 wd 0.0500 time 0.5682 (0.5867) data time 0.0006 (0.0017) model time 0.5676 (0.5866) loss 7.8711 (7.5911) grad_norm 2.3487 (inf) loss_scale 1024.0000 (1591.6275) mem 22339MB +[2024-07-25 03:20:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][460/625] eta 0:01:36 lr 0.000798 wd 0.0500 time 0.5716 (0.5865) data time 0.0008 (0.0017) model time 0.5708 (0.5864) loss 7.3797 (7.5740) grad_norm 1.7898 (inf) loss_scale 1024.0000 (1579.3145) mem 22339MB +[2024-07-25 03:20:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][470/625] eta 0:01:30 lr 0.000797 wd 0.0500 time 0.5632 (0.5863) data time 0.0006 (0.0017) model time 0.5626 (0.5861) loss 7.9355 (7.5638) grad_norm 2.9818 (inf) loss_scale 1024.0000 (1567.5244) mem 22339MB +[2024-07-25 03:20:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][480/625] eta 0:01:24 lr 0.000797 wd 0.0500 time 0.5718 (0.5861) data time 0.0007 (0.0017) model time 0.5711 (0.5858) loss 8.2250 (7.5669) grad_norm 2.0433 (inf) loss_scale 1024.0000 (1556.2245) mem 22339MB +[2024-07-25 03:20:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][490/625] eta 0:01:19 lr 0.000797 wd 0.0500 time 0.5655 (0.5859) data time 0.0006 (0.0017) model time 0.5649 (0.5856) loss 6.3586 (7.5654) grad_norm 1.5122 (inf) loss_scale 1024.0000 (1545.3849) mem 22339MB +[2024-07-25 03:21:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][500/625] eta 0:01:13 lr 0.000797 wd 0.0500 time 0.5660 (0.5857) data time 0.0008 (0.0016) model time 0.5653 (0.5853) loss 6.6988 (7.5631) grad_norm 4.0750 (inf) loss_scale 1024.0000 (1534.9780) mem 22339MB +[2024-07-25 03:21:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][510/625] eta 0:01:07 lr 0.000797 wd 0.0500 time 0.5749 (0.5855) data time 0.0008 (0.0016) model time 0.5741 (0.5852) loss 7.8051 (7.5637) grad_norm 2.3417 (inf) loss_scale 1024.0000 (1524.9785) mem 22339MB +[2024-07-25 03:21:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][520/625] eta 0:01:01 lr 0.000797 wd 0.0500 time 0.5698 (0.5853) data time 0.0008 (0.0016) model time 0.5690 (0.5849) loss 7.5575 (7.5558) grad_norm 2.0379 (inf) loss_scale 1024.0000 (1515.3628) mem 22339MB +[2024-07-25 03:21:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][530/625] eta 0:00:55 lr 0.000797 wd 0.0500 time 0.5719 (0.5851) data time 0.0009 (0.0016) model time 0.5710 (0.5847) loss 6.8052 (7.5464) grad_norm 2.3892 (inf) loss_scale 1024.0000 (1506.1092) mem 22339MB +[2024-07-25 03:21:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][540/625] eta 0:00:49 lr 0.000797 wd 0.0500 time 0.5673 (0.5851) data time 0.0006 (0.0016) model time 0.5667 (0.5847) loss 8.4143 (7.5425) grad_norm 2.1352 (inf) loss_scale 1024.0000 (1497.1978) mem 22339MB +[2024-07-25 03:21:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][550/625] eta 0:00:43 lr 0.000797 wd 0.0500 time 0.5634 (0.5850) data time 0.0008 (0.0016) model time 0.5626 (0.5845) loss 7.5183 (7.5459) grad_norm 2.4210 (inf) loss_scale 1024.0000 (1488.6098) mem 22339MB +[2024-07-25 03:21:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][560/625] eta 0:00:38 lr 0.000797 wd 0.0500 time 0.5704 (0.5849) data time 0.0006 (0.0016) model time 0.5698 (0.5844) loss 8.8818 (7.5399) grad_norm 2.3018 (inf) loss_scale 1024.0000 (1480.3280) mem 22339MB +[2024-07-25 03:21:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][570/625] eta 0:00:32 lr 0.000796 wd 0.0500 time 0.5739 (0.5848) data time 0.0008 (0.0016) model time 0.5731 (0.5842) loss 7.7811 (7.5478) grad_norm 2.4134 (inf) loss_scale 1024.0000 (1472.3363) mem 22339MB +[2024-07-25 03:21:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][580/625] eta 0:00:26 lr 0.000796 wd 0.0500 time 0.5730 (0.5847) data time 0.0006 (0.0016) model time 0.5724 (0.5841) loss 9.1921 (7.5516) grad_norm 2.4293 (inf) loss_scale 1024.0000 (1464.6196) mem 22339MB +[2024-07-25 03:21:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][590/625] eta 0:00:20 lr 0.000796 wd 0.0500 time 0.5715 (0.5846) data time 0.0008 (0.0016) model time 0.5707 (0.5840) loss 6.9993 (7.5573) grad_norm 2.3850 (inf) loss_scale 1024.0000 (1457.1641) mem 22339MB +[2024-07-25 03:21:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][600/625] eta 0:00:14 lr 0.000796 wd 0.0500 time 0.5641 (0.5846) data time 0.0006 (0.0016) model time 0.5635 (0.5840) loss 8.9053 (7.5507) grad_norm 1.5699 (inf) loss_scale 1024.0000 (1449.9567) mem 22339MB +[2024-07-25 03:22:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][610/625] eta 0:00:08 lr 0.000796 wd 0.0500 time 0.7149 (0.5849) data time 0.0006 (0.0016) model time 0.7143 (0.5843) loss 7.5213 (7.5505) grad_norm 2.2949 (inf) loss_scale 1024.0000 (1442.9853) mem 22339MB +[2024-07-25 03:22:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [130/300][620/625] eta 0:00:02 lr 0.000796 wd 0.0500 time 0.6915 (0.5858) data time 0.0005 (0.0015) model time 0.6910 (0.5853) loss 7.8749 (7.5564) grad_norm 2.3582 (inf) loss_scale 1024.0000 (1436.2383) mem 22339MB +[2024-07-25 03:22:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 130 training takes 0:06:06 +[2024-07-25 03:22:13 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 03:22:15 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 03:22:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.476 (0.476) Loss 0.5229 (0.5229) Acc@1 89.258 (89.258) Acc@5 98.340 (98.340) Mem 22339MB +[2024-07-25 03:22:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.8281 (0.6602) Acc@1 80.664 (85.627) Acc@5 95.898 (97.474) Mem 22339MB +[2024-07-25 03:22:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9741 (0.7802) Acc@1 75.098 (82.347) Acc@5 94.629 (96.250) Mem 22339MB +[2024-07-25 03:22:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.064 Acc@5 96.249 +[2024-07-25 03:22:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.1% +[2024-07-25 03:22:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.864 (0.864) Loss 0.5015 (0.5015) Acc@1 89.014 (89.014) Acc@5 98.584 (98.584) Mem 22339MB +[2024-07-25 03:22:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.193) Loss 0.7964 (0.6311) Acc@1 80.908 (86.337) Acc@5 96.191 (97.696) Mem 22339MB +[2024-07-25 03:22:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.161) Loss 0.9155 (0.7397) Acc@1 77.539 (83.073) Acc@5 95.557 (96.617) Mem 22339MB +[2024-07-25 03:22:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.748 Acc@5 96.623 +[2024-07-25 03:22:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.7% +[2024-07-25 03:22:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.75% +[2024-07-25 03:22:22 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 03:22:24 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 03:22:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][0/625] eta 0:09:45 lr 0.000796 wd 0.0500 time 0.9360 (0.9360) data time 0.4170 (0.4170) model time 0.0000 (0.0000) loss 7.4854 (7.4854) grad_norm 2.4134 (2.4134) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:22:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][10/625] eta 0:06:56 lr 0.000796 wd 0.0500 time 0.7376 (0.6780) data time 0.0008 (0.0386) model time 0.0000 (0.0000) loss 8.1030 (7.4128) grad_norm 3.2386 (2.3618) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:22:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][20/625] eta 0:06:32 lr 0.000796 wd 0.0500 time 0.6612 (0.6492) data time 0.0009 (0.0206) model time 0.0000 (0.0000) loss 8.0762 (7.3504) grad_norm 1.7369 (2.3711) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:22:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][30/625] eta 0:06:12 lr 0.000796 wd 0.0500 time 0.5724 (0.6253) data time 0.0008 (0.0142) model time 0.0000 (0.0000) loss 8.8478 (7.3377) grad_norm 2.2932 (2.3358) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:22:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][40/625] eta 0:05:58 lr 0.000795 wd 0.0500 time 0.5714 (0.6126) data time 0.0006 (0.0110) model time 0.0000 (0.0000) loss 8.1329 (7.4443) grad_norm 3.1935 (2.4918) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:22:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][50/625] eta 0:05:47 lr 0.000795 wd 0.0500 time 0.5673 (0.6051) data time 0.0008 (0.0090) model time 0.0000 (0.0000) loss 5.9385 (7.3671) grad_norm 3.0372 (2.5403) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:23:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][60/625] eta 0:05:39 lr 0.000795 wd 0.0500 time 0.5715 (0.6001) data time 0.0008 (0.0076) model time 0.5707 (0.5736) loss 8.3310 (7.3780) grad_norm 1.8987 (2.5001) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:23:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][70/625] eta 0:05:31 lr 0.000795 wd 0.0500 time 0.5734 (0.5967) data time 0.0009 (0.0067) model time 0.5725 (0.5744) loss 6.3501 (7.3419) grad_norm 1.7972 (2.4368) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:23:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][80/625] eta 0:05:23 lr 0.000795 wd 0.0500 time 0.5721 (0.5939) data time 0.0008 (0.0060) model time 0.5712 (0.5741) loss 6.5136 (7.3663) grad_norm 2.0753 (2.3945) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:23:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][90/625] eta 0:05:16 lr 0.000795 wd 0.0500 time 0.5703 (0.5917) data time 0.0006 (0.0054) model time 0.5697 (0.5738) loss 8.0916 (7.4324) grad_norm 2.5084 (2.3803) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:23:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][100/625] eta 0:05:10 lr 0.000795 wd 0.0500 time 0.5619 (0.5907) data time 0.0006 (0.0049) model time 0.5613 (0.5751) loss 8.2494 (7.4743) grad_norm 2.0936 (2.3954) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:23:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][110/625] eta 0:05:03 lr 0.000795 wd 0.0500 time 0.5684 (0.5892) data time 0.0008 (0.0046) model time 0.5676 (0.5749) loss 5.6069 (7.4629) grad_norm 2.1726 (2.4031) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:23:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][120/625] eta 0:04:56 lr 0.000795 wd 0.0500 time 0.5755 (0.5880) data time 0.0006 (0.0043) model time 0.5749 (0.5747) loss 6.3493 (7.4676) grad_norm 2.5400 (2.3828) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:23:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][130/625] eta 0:04:50 lr 0.000795 wd 0.0500 time 0.5794 (0.5878) data time 0.0006 (0.0040) model time 0.5788 (0.5760) loss 8.0881 (7.4773) grad_norm 2.3077 (2.3761) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:23:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][140/625] eta 0:04:44 lr 0.000794 wd 0.0500 time 0.5722 (0.5870) data time 0.0009 (0.0038) model time 0.5713 (0.5760) loss 7.4249 (7.4800) grad_norm 1.9149 (2.3458) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:23:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][150/625] eta 0:04:38 lr 0.000794 wd 0.0500 time 0.5740 (0.5863) data time 0.0006 (0.0036) model time 0.5734 (0.5759) loss 6.4935 (7.4611) grad_norm 3.3198 (2.3265) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:23:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][160/625] eta 0:04:32 lr 0.000794 wd 0.0500 time 0.5722 (0.5857) data time 0.0008 (0.0034) model time 0.5714 (0.5759) loss 7.3882 (7.4789) grad_norm 2.1305 (2.3159) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:24:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][170/625] eta 0:04:26 lr 0.000794 wd 0.0500 time 0.5758 (0.5851) data time 0.0006 (0.0032) model time 0.5752 (0.5758) loss 8.2818 (7.4788) grad_norm 1.7253 (2.3321) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:24:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][180/625] eta 0:04:20 lr 0.000794 wd 0.0500 time 0.5615 (0.5848) data time 0.0007 (0.0031) model time 0.5609 (0.5760) loss 8.7420 (7.5052) grad_norm 2.2798 (2.3404) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:24:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][190/625] eta 0:04:14 lr 0.000794 wd 0.0500 time 0.5624 (0.5843) data time 0.0009 (0.0030) model time 0.5615 (0.5759) loss 9.4320 (7.4956) grad_norm 4.3033 (2.3754) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:24:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][200/625] eta 0:04:08 lr 0.000794 wd 0.0500 time 0.5751 (0.5845) data time 0.0009 (0.0029) model time 0.5742 (0.5766) loss 8.2260 (7.5102) grad_norm 2.7227 (2.3724) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:24:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][210/625] eta 0:04:03 lr 0.000794 wd 0.0500 time 0.7284 (0.5861) data time 0.0008 (0.0028) model time 0.7276 (0.5793) loss 6.6475 (7.5095) grad_norm 1.6314 (2.3561) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:24:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][220/625] eta 0:03:58 lr 0.000794 wd 0.0500 time 0.7074 (0.5882) data time 0.0007 (0.0027) model time 0.7067 (0.5823) loss 7.8830 (7.5087) grad_norm 1.6483 (2.3463) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:24:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][230/625] eta 0:03:52 lr 0.000794 wd 0.0500 time 0.7615 (0.5899) data time 0.0006 (0.0026) model time 0.7609 (0.5847) loss 8.1095 (7.5309) grad_norm 2.0162 (2.3277) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:24:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][240/625] eta 0:03:47 lr 0.000793 wd 0.0500 time 0.5637 (0.5913) data time 0.0006 (0.0025) model time 0.5631 (0.5868) loss 7.9500 (7.5379) grad_norm 1.6776 (2.3103) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:24:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][250/625] eta 0:03:41 lr 0.000793 wd 0.0500 time 0.5743 (0.5907) data time 0.0008 (0.0025) model time 0.5735 (0.5862) loss 7.9076 (7.5493) grad_norm 3.4689 (2.3475) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:24:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][260/625] eta 0:03:35 lr 0.000793 wd 0.0500 time 0.5728 (0.5902) data time 0.0008 (0.0024) model time 0.5720 (0.5858) loss 6.1142 (7.5132) grad_norm 3.1988 (2.3434) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:25:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][270/625] eta 0:03:29 lr 0.000793 wd 0.0500 time 0.5702 (0.5897) data time 0.0006 (0.0023) model time 0.5696 (0.5853) loss 7.9870 (7.5186) grad_norm 1.6629 (2.3291) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:25:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][280/625] eta 0:03:23 lr 0.000793 wd 0.0500 time 0.5617 (0.5893) data time 0.0007 (0.0024) model time 0.5611 (0.5849) loss 6.1871 (7.4984) grad_norm 2.6784 (2.3324) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:25:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][290/625] eta 0:03:17 lr 0.000793 wd 0.0500 time 0.5734 (0.5888) data time 0.0008 (0.0023) model time 0.5727 (0.5845) loss 7.1114 (7.5010) grad_norm 1.8260 (2.3257) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:25:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][300/625] eta 0:03:11 lr 0.000793 wd 0.0500 time 0.5687 (0.5884) data time 0.0008 (0.0023) model time 0.5678 (0.5841) loss 8.7131 (7.5056) grad_norm 2.5057 (2.3167) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:25:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][310/625] eta 0:03:05 lr 0.000793 wd 0.0500 time 0.5722 (0.5880) data time 0.0006 (0.0022) model time 0.5717 (0.5837) loss 7.6792 (7.4959) grad_norm 2.1841 (2.3063) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:25:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][320/625] eta 0:02:59 lr 0.000793 wd 0.0500 time 0.5689 (0.5876) data time 0.0006 (0.0022) model time 0.5683 (0.5834) loss 7.3181 (7.4991) grad_norm 3.5065 (2.3115) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:25:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][330/625] eta 0:02:53 lr 0.000793 wd 0.0500 time 0.5735 (0.5873) data time 0.0008 (0.0021) model time 0.5727 (0.5831) loss 6.6970 (7.4837) grad_norm 1.7648 (2.3128) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:25:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][340/625] eta 0:02:47 lr 0.000792 wd 0.0500 time 0.5625 (0.5870) data time 0.0006 (0.0021) model time 0.5619 (0.5829) loss 6.6521 (7.4859) grad_norm 1.9884 (2.3026) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:25:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][350/625] eta 0:02:41 lr 0.000792 wd 0.0500 time 0.5731 (0.5871) data time 0.0007 (0.0021) model time 0.5724 (0.5832) loss 8.2975 (7.4811) grad_norm 2.2222 (2.2927) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:25:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][360/625] eta 0:02:35 lr 0.000792 wd 0.0500 time 0.5640 (0.5870) data time 0.0008 (0.0020) model time 0.5632 (0.5831) loss 6.8425 (7.4760) grad_norm 1.9138 (2.2961) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:26:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][370/625] eta 0:02:29 lr 0.000792 wd 0.0500 time 0.5631 (0.5869) data time 0.0006 (0.0020) model time 0.5624 (0.5831) loss 6.6791 (7.4702) grad_norm 1.6442 (2.2892) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:26:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][380/625] eta 0:02:23 lr 0.000792 wd 0.0500 time 0.5636 (0.5867) data time 0.0008 (0.0020) model time 0.5628 (0.5830) loss 7.3736 (7.4733) grad_norm 1.9031 (2.2872) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:26:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][390/625] eta 0:02:17 lr 0.000792 wd 0.0500 time 0.5716 (0.5866) data time 0.0008 (0.0019) model time 0.5708 (0.5829) loss 6.9025 (7.4703) grad_norm 2.1026 (2.2823) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:26:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][400/625] eta 0:02:11 lr 0.000792 wd 0.0500 time 0.5735 (0.5863) data time 0.0006 (0.0019) model time 0.5730 (0.5826) loss 8.1649 (7.4808) grad_norm 3.2666 (2.2969) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:26:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][410/625] eta 0:02:06 lr 0.000792 wd 0.0500 time 0.5641 (0.5862) data time 0.0007 (0.0019) model time 0.5635 (0.5826) loss 7.1295 (7.4832) grad_norm 1.6783 (2.3008) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:26:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][420/625] eta 0:02:00 lr 0.000792 wd 0.0500 time 0.5626 (0.5866) data time 0.0008 (0.0019) model time 0.5617 (0.5832) loss 8.2208 (7.4844) grad_norm 3.0788 (2.3118) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:26:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][430/625] eta 0:01:54 lr 0.000792 wd 0.0500 time 0.5637 (0.5873) data time 0.0006 (0.0019) model time 0.5631 (0.5840) loss 7.8038 (7.4810) grad_norm 2.6222 (2.3106) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:26:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][440/625] eta 0:01:48 lr 0.000791 wd 0.0500 time 0.7099 (0.5883) data time 0.0006 (0.0018) model time 0.7092 (0.5852) loss 6.4499 (7.4691) grad_norm 1.9936 (2.3194) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:26:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][450/625] eta 0:01:43 lr 0.000791 wd 0.0500 time 0.7803 (0.5891) data time 0.0009 (0.0018) model time 0.7794 (0.5862) loss 6.6215 (7.4691) grad_norm 1.4994 (2.3149) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:26:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][460/625] eta 0:01:37 lr 0.000791 wd 0.0500 time 0.7059 (0.5901) data time 0.0006 (0.0018) model time 0.7053 (0.5873) loss 8.9079 (7.4739) grad_norm 1.7001 (2.3172) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:27:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][470/625] eta 0:01:31 lr 0.000791 wd 0.0500 time 0.5722 (0.5898) data time 0.0008 (0.0018) model time 0.5714 (0.5870) loss 7.0538 (7.4736) grad_norm 2.7856 (2.3119) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:27:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][480/625] eta 0:01:25 lr 0.000791 wd 0.0500 time 0.5718 (0.5895) data time 0.0007 (0.0018) model time 0.5711 (0.5867) loss 7.9087 (7.4743) grad_norm 1.5663 (2.3057) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:27:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][490/625] eta 0:01:19 lr 0.000791 wd 0.0500 time 0.5613 (0.5892) data time 0.0008 (0.0017) model time 0.5605 (0.5864) loss 5.5407 (7.4682) grad_norm 2.1742 (2.2999) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:27:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][500/625] eta 0:01:13 lr 0.000791 wd 0.0500 time 0.5722 (0.5889) data time 0.0006 (0.0017) model time 0.5716 (0.5862) loss 7.6727 (7.4745) grad_norm 2.7226 (2.2987) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:27:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][510/625] eta 0:01:07 lr 0.000791 wd 0.0500 time 0.5750 (0.5887) data time 0.0008 (0.0017) model time 0.5743 (0.5859) loss 8.1104 (7.4678) grad_norm 1.8782 (2.2932) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:27:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][520/625] eta 0:01:01 lr 0.000791 wd 0.0500 time 0.5754 (0.5884) data time 0.0008 (0.0017) model time 0.5746 (0.5857) loss 6.8395 (7.4687) grad_norm 2.2159 (2.2898) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:27:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][530/625] eta 0:00:55 lr 0.000791 wd 0.0500 time 0.5641 (0.5882) data time 0.0007 (0.0017) model time 0.5635 (0.5855) loss 5.7784 (7.4678) grad_norm 1.8655 (2.2899) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:27:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][540/625] eta 0:00:49 lr 0.000790 wd 0.0500 time 0.5625 (0.5879) data time 0.0006 (0.0017) model time 0.5618 (0.5853) loss 7.4876 (7.4685) grad_norm 2.2553 (2.2875) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:27:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][550/625] eta 0:00:44 lr 0.000790 wd 0.0500 time 0.5716 (0.5877) data time 0.0008 (0.0016) model time 0.5708 (0.5850) loss 8.3847 (7.4770) grad_norm 3.3069 (2.2859) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:27:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][560/625] eta 0:00:38 lr 0.000790 wd 0.0500 time 0.5692 (0.5875) data time 0.0007 (0.0016) model time 0.5685 (0.5849) loss 8.2757 (7.4830) grad_norm 3.1075 (2.2959) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:27:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][570/625] eta 0:00:32 lr 0.000790 wd 0.0500 time 0.5738 (0.5874) data time 0.0008 (0.0016) model time 0.5730 (0.5848) loss 7.4383 (7.4813) grad_norm 2.1006 (2.2894) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:28:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][580/625] eta 0:00:26 lr 0.000790 wd 0.0500 time 0.5696 (0.5872) data time 0.0006 (0.0016) model time 0.5690 (0.5846) loss 5.9674 (7.4805) grad_norm 2.1279 (2.2893) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:28:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][590/625] eta 0:00:20 lr 0.000790 wd 0.0500 time 0.5736 (0.5870) data time 0.0006 (0.0016) model time 0.5730 (0.5845) loss 6.3612 (7.4877) grad_norm 1.8770 (2.2953) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:28:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][600/625] eta 0:00:14 lr 0.000790 wd 0.0500 time 0.5654 (0.5869) data time 0.0008 (0.0016) model time 0.5645 (0.5843) loss 7.9977 (7.4833) grad_norm 2.4457 (2.3027) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:28:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][610/625] eta 0:00:08 lr 0.000790 wd 0.0500 time 0.5713 (0.5867) data time 0.0004 (0.0016) model time 0.5709 (0.5841) loss 8.2041 (7.4794) grad_norm 1.8429 (2.3077) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:28:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [131/300][620/625] eta 0:00:02 lr 0.000790 wd 0.0500 time 0.5611 (0.5865) data time 0.0004 (0.0015) model time 0.5607 (0.5839) loss 7.4785 (7.4847) grad_norm 1.6878 (2.3026) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:28:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 131 training takes 0:06:06 +[2024-07-25 03:28:30 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 03:28:32 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 03:28:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.470 (0.470) Loss 0.5332 (0.5332) Acc@1 89.258 (89.258) Acc@5 98.633 (98.633) Mem 22339MB +[2024-07-25 03:28:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.8369 (0.6704) Acc@1 79.834 (85.738) Acc@5 95.850 (97.599) Mem 22339MB +[2024-07-25 03:28:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 1.0059 (0.7887) Acc@1 76.660 (82.538) Acc@5 93.604 (96.263) Mem 22339MB +[2024-07-25 03:28:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.224 Acc@5 96.279 +[2024-07-25 03:28:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.2% +[2024-07-25 03:28:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 82.22% +[2024-07-25 03:28:35 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 03:28:37 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 03:28:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.480 (0.480) Loss 0.5015 (0.5015) Acc@1 89.062 (89.062) Acc@5 98.584 (98.584) Mem 22339MB +[2024-07-25 03:28:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.7939 (0.6306) Acc@1 81.006 (86.346) Acc@5 96.240 (97.701) Mem 22339MB +[2024-07-25 03:28:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9150 (0.7391) Acc@1 77.344 (83.071) Acc@5 95.508 (96.626) Mem 22339MB +[2024-07-25 03:28:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.750 Acc@5 96.633 +[2024-07-25 03:28:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.7% +[2024-07-25 03:28:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.75% +[2024-07-25 03:28:40 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 03:28:41 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 03:28:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][0/625] eta 0:09:57 lr 0.000790 wd 0.0500 time 0.9553 (0.9553) data time 0.4380 (0.4380) model time 0.0000 (0.0000) loss 7.1938 (7.1938) grad_norm 2.6112 (2.6112) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:28:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][10/625] eta 0:06:15 lr 0.000789 wd 0.0500 time 0.5646 (0.6104) data time 0.0006 (0.0405) model time 0.0000 (0.0000) loss 8.7553 (7.9697) grad_norm 3.2989 (2.1504) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:28:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][20/625] eta 0:06:02 lr 0.000789 wd 0.0500 time 0.5717 (0.5997) data time 0.0006 (0.0216) model time 0.0000 (0.0000) loss 7.8996 (7.9264) grad_norm 2.0822 (2.2236) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:29:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][30/625] eta 0:06:03 lr 0.000789 wd 0.0500 time 0.7024 (0.6108) data time 0.0006 (0.0149) model time 0.0000 (0.0000) loss 8.0357 (7.8518) grad_norm 1.8349 (2.1418) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:29:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][40/625] eta 0:06:01 lr 0.000789 wd 0.0500 time 0.7487 (0.6179) data time 0.0006 (0.0115) model time 0.0000 (0.0000) loss 7.3771 (7.8544) grad_norm 2.0845 (2.1385) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:29:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][50/625] eta 0:05:57 lr 0.000789 wd 0.0500 time 0.7167 (0.6217) data time 0.0008 (0.0095) model time 0.0000 (0.0000) loss 8.3661 (7.8053) grad_norm 1.9917 (2.1463) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:29:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][60/625] eta 0:05:49 lr 0.000789 wd 0.0500 time 0.5672 (0.6182) data time 0.0008 (0.0080) model time 0.5664 (0.5996) loss 6.0680 (7.7177) grad_norm 2.9098 (2.1003) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:29:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][70/625] eta 0:05:40 lr 0.000789 wd 0.0500 time 0.5769 (0.6135) data time 0.0006 (0.0070) model time 0.5763 (0.5918) loss 7.5601 (7.6454) grad_norm 1.6997 (2.1143) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:29:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][80/625] eta 0:05:31 lr 0.000789 wd 0.0500 time 0.5756 (0.6089) data time 0.0008 (0.0062) model time 0.5748 (0.5864) loss 8.6732 (7.6400) grad_norm 2.7497 (2.1462) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:29:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][90/625] eta 0:05:23 lr 0.000789 wd 0.0500 time 0.5737 (0.6055) data time 0.0006 (0.0057) model time 0.5731 (0.5841) loss 7.8358 (7.6380) grad_norm 2.1094 (2.1980) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:29:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][100/625] eta 0:05:16 lr 0.000789 wd 0.0500 time 0.5615 (0.6028) data time 0.0007 (0.0052) model time 0.5608 (0.5828) loss 6.2493 (7.6222) grad_norm 2.4810 (2.2006) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:29:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][110/625] eta 0:05:09 lr 0.000788 wd 0.0500 time 0.5727 (0.6003) data time 0.0008 (0.0048) model time 0.5720 (0.5813) loss 7.5930 (7.6156) grad_norm 1.6000 (2.1631) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:29:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][120/625] eta 0:05:02 lr 0.000788 wd 0.0500 time 0.5647 (0.5983) data time 0.0008 (0.0045) model time 0.5638 (0.5804) loss 8.9998 (7.6381) grad_norm 1.9498 (2.1563) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:30:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][130/625] eta 0:04:55 lr 0.000788 wd 0.0500 time 0.5714 (0.5970) data time 0.0008 (0.0042) model time 0.5706 (0.5803) loss 8.3211 (7.6708) grad_norm 1.5958 (2.1418) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:30:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][140/625] eta 0:04:48 lr 0.000788 wd 0.0500 time 0.5693 (0.5954) data time 0.0010 (0.0040) model time 0.5683 (0.5797) loss 6.0990 (7.6455) grad_norm 1.7357 (2.1623) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:30:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][150/625] eta 0:04:42 lr 0.000788 wd 0.0500 time 0.5705 (0.5944) data time 0.0007 (0.0038) model time 0.5698 (0.5796) loss 8.6719 (7.6242) grad_norm 2.2149 (2.1780) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:30:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][160/625] eta 0:04:35 lr 0.000788 wd 0.0500 time 0.5704 (0.5933) data time 0.0008 (0.0037) model time 0.5696 (0.5790) loss 9.0259 (7.6015) grad_norm 1.7893 (2.1843) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:30:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][170/625] eta 0:04:29 lr 0.000788 wd 0.0500 time 0.5663 (0.5921) data time 0.0006 (0.0035) model time 0.5657 (0.5785) loss 7.6324 (7.6044) grad_norm 2.1010 (2.1833) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:30:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][180/625] eta 0:04:23 lr 0.000788 wd 0.0500 time 0.5726 (0.5912) data time 0.0006 (0.0034) model time 0.5720 (0.5782) loss 8.5610 (7.5918) grad_norm 1.7728 (2.2000) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:30:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][190/625] eta 0:04:16 lr 0.000788 wd 0.0500 time 0.5700 (0.5903) data time 0.0006 (0.0033) model time 0.5694 (0.5778) loss 6.9539 (7.5982) grad_norm 2.4883 (2.1960) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:30:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][200/625] eta 0:04:10 lr 0.000788 wd 0.0500 time 0.5709 (0.5895) data time 0.0006 (0.0031) model time 0.5703 (0.5776) loss 8.1727 (7.5930) grad_norm 3.8299 (2.2005) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:30:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][210/625] eta 0:04:04 lr 0.000787 wd 0.0500 time 0.5677 (0.5888) data time 0.0008 (0.0030) model time 0.5669 (0.5774) loss 7.3667 (7.6014) grad_norm 1.6015 (2.1949) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:30:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][220/625] eta 0:03:58 lr 0.000787 wd 0.0500 time 0.5723 (0.5882) data time 0.0006 (0.0029) model time 0.5717 (0.5772) loss 7.3828 (7.6007) grad_norm 3.0230 (2.1971) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:30:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][230/625] eta 0:03:52 lr 0.000787 wd 0.0500 time 0.5739 (0.5876) data time 0.0008 (0.0028) model time 0.5731 (0.5770) loss 7.0850 (7.5695) grad_norm 3.3828 (2.2001) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:31:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][240/625] eta 0:03:46 lr 0.000787 wd 0.0500 time 0.5725 (0.5878) data time 0.0008 (0.0027) model time 0.5717 (0.5778) loss 8.1382 (7.5653) grad_norm 1.6415 (2.1952) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:31:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][250/625] eta 0:03:41 lr 0.000787 wd 0.0500 time 0.6934 (0.5899) data time 0.0006 (0.0027) model time 0.6928 (0.5809) loss 7.3542 (7.5666) grad_norm 1.7109 (2.1844) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:31:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][260/625] eta 0:03:35 lr 0.000787 wd 0.0500 time 0.5735 (0.5906) data time 0.0006 (0.0026) model time 0.5728 (0.5821) loss 7.8298 (7.5712) grad_norm 1.6926 (2.1780) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:31:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][270/625] eta 0:03:29 lr 0.000787 wd 0.0500 time 0.5708 (0.5915) data time 0.0007 (0.0025) model time 0.5701 (0.5836) loss 6.2027 (7.5568) grad_norm 2.0456 (2.1771) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:31:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][280/625] eta 0:03:24 lr 0.000787 wd 0.0500 time 0.5757 (0.5922) data time 0.0008 (0.0025) model time 0.5749 (0.5847) loss 7.3378 (7.5684) grad_norm 1.6330 (2.1665) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:31:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][290/625] eta 0:03:18 lr 0.000787 wd 0.0500 time 0.5632 (0.5920) data time 0.0008 (0.0024) model time 0.5623 (0.5848) loss 9.5047 (7.5834) grad_norm 2.0406 (2.1650) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:31:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][300/625] eta 0:03:12 lr 0.000787 wd 0.0500 time 0.5691 (0.5914) data time 0.0008 (0.0024) model time 0.5683 (0.5844) loss 8.1940 (7.5876) grad_norm 1.7782 (2.1770) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:31:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][310/625] eta 0:03:06 lr 0.000786 wd 0.0500 time 0.5693 (0.5909) data time 0.0006 (0.0023) model time 0.5687 (0.5840) loss 8.4509 (7.5921) grad_norm 2.2270 (2.1848) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:31:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][320/625] eta 0:03:00 lr 0.000786 wd 0.0500 time 0.5683 (0.5905) data time 0.0008 (0.0023) model time 0.5675 (0.5837) loss 8.4318 (7.6068) grad_norm 2.3599 (2.1786) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:31:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][330/625] eta 0:02:54 lr 0.000786 wd 0.0500 time 0.5734 (0.5900) data time 0.0008 (0.0022) model time 0.5726 (0.5834) loss 7.4604 (7.6099) grad_norm 2.5781 (2.1760) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:32:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][340/625] eta 0:02:48 lr 0.000786 wd 0.0500 time 0.5749 (0.5896) data time 0.0008 (0.0022) model time 0.5741 (0.5831) loss 7.5709 (7.6114) grad_norm 2.3156 (2.1893) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:32:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][350/625] eta 0:02:42 lr 0.000786 wd 0.0500 time 0.5788 (0.5892) data time 0.0006 (0.0021) model time 0.5782 (0.5827) loss 7.7896 (7.6196) grad_norm 3.8488 (2.2065) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:32:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][360/625] eta 0:02:36 lr 0.000786 wd 0.0500 time 0.5690 (0.5888) data time 0.0006 (0.0021) model time 0.5684 (0.5825) loss 5.9774 (7.6146) grad_norm 3.6775 (2.2274) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:32:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][370/625] eta 0:02:30 lr 0.000786 wd 0.0500 time 0.5727 (0.5884) data time 0.0007 (0.0021) model time 0.5720 (0.5822) loss 6.9966 (7.6136) grad_norm 1.7764 (2.2345) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:32:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][380/625] eta 0:02:24 lr 0.000786 wd 0.0500 time 0.5742 (0.5881) data time 0.0008 (0.0020) model time 0.5734 (0.5820) loss 8.4254 (7.6082) grad_norm 4.1045 (2.2542) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:32:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][390/625] eta 0:02:18 lr 0.000786 wd 0.0500 time 0.5736 (0.5878) data time 0.0007 (0.0020) model time 0.5728 (0.5818) loss 5.9531 (7.5926) grad_norm 1.8120 (2.2530) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:32:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][400/625] eta 0:02:12 lr 0.000785 wd 0.0500 time 0.5710 (0.5875) data time 0.0006 (0.0020) model time 0.5704 (0.5816) loss 7.5763 (7.5909) grad_norm 2.7993 (2.2548) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:32:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][410/625] eta 0:02:06 lr 0.000785 wd 0.0500 time 0.5714 (0.5872) data time 0.0008 (0.0019) model time 0.5706 (0.5814) loss 7.5397 (7.6014) grad_norm 1.9677 (2.2602) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:32:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][420/625] eta 0:02:00 lr 0.000785 wd 0.0500 time 0.5737 (0.5869) data time 0.0008 (0.0019) model time 0.5728 (0.5812) loss 7.2086 (7.6111) grad_norm 2.1493 (2.2573) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:32:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][430/625] eta 0:01:54 lr 0.000785 wd 0.0500 time 0.5654 (0.5866) data time 0.0006 (0.0019) model time 0.5648 (0.5811) loss 8.0954 (7.6134) grad_norm 2.1263 (2.2510) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:33:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][440/625] eta 0:01:48 lr 0.000785 wd 0.0500 time 0.5773 (0.5864) data time 0.0008 (0.0019) model time 0.5765 (0.5809) loss 6.4913 (7.6052) grad_norm 1.5819 (2.2493) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:33:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][450/625] eta 0:01:42 lr 0.000785 wd 0.0500 time 0.5702 (0.5862) data time 0.0006 (0.0018) model time 0.5696 (0.5808) loss 8.0065 (7.6087) grad_norm 2.0411 (2.2534) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:33:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][460/625] eta 0:01:36 lr 0.000785 wd 0.0500 time 0.5747 (0.5861) data time 0.0008 (0.0018) model time 0.5739 (0.5808) loss 7.5173 (7.6131) grad_norm 1.4678 (2.2524) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:33:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][470/625] eta 0:01:30 lr 0.000785 wd 0.0500 time 0.5695 (0.5871) data time 0.0008 (0.0018) model time 0.5687 (0.5820) loss 7.3834 (7.6110) grad_norm 1.6800 (2.2467) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:33:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][480/625] eta 0:01:25 lr 0.000785 wd 0.0500 time 0.5663 (0.5879) data time 0.0010 (0.0018) model time 0.5653 (0.5830) loss 8.3584 (7.6120) grad_norm 2.4049 (2.2473) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:33:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][490/625] eta 0:01:19 lr 0.000785 wd 0.0500 time 0.7218 (0.5889) data time 0.0007 (0.0018) model time 0.7211 (0.5842) loss 7.5568 (7.6092) grad_norm 2.3121 (2.2509) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:33:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][500/625] eta 0:01:13 lr 0.000784 wd 0.0500 time 0.5822 (0.5893) data time 0.0006 (0.0017) model time 0.5816 (0.5847) loss 8.1530 (7.6021) grad_norm 3.5256 (2.2451) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:33:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][510/625] eta 0:01:07 lr 0.000784 wd 0.0500 time 0.5731 (0.5893) data time 0.0008 (0.0017) model time 0.5723 (0.5848) loss 7.5094 (7.6003) grad_norm 1.9609 (2.2488) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:33:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][520/625] eta 0:01:01 lr 0.000784 wd 0.0500 time 0.5720 (0.5890) data time 0.0008 (0.0017) model time 0.5712 (0.5846) loss 8.4495 (7.5981) grad_norm 2.8449 (2.2506) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:33:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][530/625] eta 0:00:55 lr 0.000784 wd 0.0500 time 0.5749 (0.5888) data time 0.0006 (0.0017) model time 0.5743 (0.5844) loss 9.1643 (7.6031) grad_norm 1.4834 (2.2449) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:34:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][540/625] eta 0:00:50 lr 0.000784 wd 0.0500 time 0.5723 (0.5885) data time 0.0006 (0.0017) model time 0.5717 (0.5842) loss 7.7155 (7.6022) grad_norm 2.2654 (2.2416) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:34:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][550/625] eta 0:00:44 lr 0.000784 wd 0.0500 time 0.5733 (0.5883) data time 0.0007 (0.0017) model time 0.5726 (0.5840) loss 7.5982 (7.6005) grad_norm 2.0757 (2.2394) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:34:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][560/625] eta 0:00:38 lr 0.000784 wd 0.0500 time 0.5729 (0.5881) data time 0.0008 (0.0016) model time 0.5721 (0.5838) loss 6.9055 (7.5981) grad_norm 3.5121 (2.2553) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:34:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][570/625] eta 0:00:32 lr 0.000784 wd 0.0500 time 0.5746 (0.5878) data time 0.0008 (0.0016) model time 0.5738 (0.5836) loss 7.9469 (7.6015) grad_norm 3.1382 (2.2607) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:34:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][580/625] eta 0:00:26 lr 0.000784 wd 0.0500 time 0.5718 (0.5876) data time 0.0009 (0.0016) model time 0.5709 (0.5835) loss 7.5284 (7.6086) grad_norm 2.0765 (2.2606) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:34:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][590/625] eta 0:00:20 lr 0.000784 wd 0.0500 time 0.5698 (0.5874) data time 0.0006 (0.0016) model time 0.5692 (0.5833) loss 7.4313 (7.5994) grad_norm 2.1902 (2.2594) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:34:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][600/625] eta 0:00:14 lr 0.000783 wd 0.0500 time 0.5707 (0.5872) data time 0.0008 (0.0016) model time 0.5699 (0.5832) loss 8.6576 (7.5989) grad_norm 1.9544 (2.2604) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:34:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][610/625] eta 0:00:08 lr 0.000783 wd 0.0500 time 0.5668 (0.5871) data time 0.0004 (0.0016) model time 0.5664 (0.5830) loss 7.8293 (7.6032) grad_norm 3.6857 (2.2654) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:34:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [132/300][620/625] eta 0:00:02 lr 0.000783 wd 0.0500 time 0.5719 (0.5869) data time 0.0004 (0.0016) model time 0.5715 (0.5829) loss 8.7328 (7.6052) grad_norm 2.3421 (2.2657) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:34:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 132 training takes 0:06:06 +[2024-07-25 03:34:48 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 03:34:50 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 03:34:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.480 (0.480) Loss 0.5356 (0.5356) Acc@1 88.867 (88.867) Acc@5 98.486 (98.486) Mem 22339MB +[2024-07-25 03:34:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.8325 (0.6685) Acc@1 80.713 (85.645) Acc@5 96.289 (97.643) Mem 22339MB +[2024-07-25 03:34:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.9707 (0.7837) Acc@1 76.660 (82.436) Acc@5 94.189 (96.340) Mem 22339MB +[2024-07-25 03:34:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.174 Acc@5 96.305 +[2024-07-25 03:34:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.2% +[2024-07-25 03:34:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.842 (0.842) Loss 0.5000 (0.5000) Acc@1 89.014 (89.014) Acc@5 98.584 (98.584) Mem 22339MB +[2024-07-25 03:34:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.191) Loss 0.7920 (0.6301) Acc@1 80.957 (86.310) Acc@5 96.191 (97.696) Mem 22339MB +[2024-07-25 03:34:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.160) Loss 0.9141 (0.7383) Acc@1 77.148 (83.080) Acc@5 95.508 (96.640) Mem 22339MB +[2024-07-25 03:34:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.768 Acc@5 96.653 +[2024-07-25 03:34:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.8% +[2024-07-25 03:34:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.77% +[2024-07-25 03:34:57 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 03:34:58 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 03:34:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][0/625] eta 0:09:12 lr 0.000783 wd 0.0500 time 0.8847 (0.8847) data time 0.3625 (0.3625) model time 0.0000 (0.0000) loss 6.9518 (6.9518) grad_norm 2.5572 (2.5572) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:35:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][10/625] eta 0:06:10 lr 0.000783 wd 0.0500 time 0.5727 (0.6019) data time 0.0006 (0.0337) model time 0.0000 (0.0000) loss 7.8005 (7.6647) grad_norm 1.6660 (2.2060) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:35:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][20/625] eta 0:05:56 lr 0.000783 wd 0.0500 time 0.5724 (0.5892) data time 0.0006 (0.0180) model time 0.0000 (0.0000) loss 8.1430 (7.6218) grad_norm 3.6171 (2.1739) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:35:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][30/625] eta 0:05:47 lr 0.000783 wd 0.0500 time 0.5733 (0.5848) data time 0.0006 (0.0125) model time 0.0000 (0.0000) loss 7.0756 (7.6050) grad_norm 2.2650 (2.2752) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:35:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][40/625] eta 0:05:41 lr 0.000783 wd 0.0500 time 0.5767 (0.5845) data time 0.0006 (0.0096) model time 0.0000 (0.0000) loss 7.5094 (7.4653) grad_norm 1.6499 (2.2179) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:35:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][50/625] eta 0:05:35 lr 0.000783 wd 0.0500 time 0.6149 (0.5836) data time 0.0006 (0.0079) model time 0.0000 (0.0000) loss 7.6036 (7.5869) grad_norm 1.5841 (2.2206) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:35:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][60/625] eta 0:05:29 lr 0.000783 wd 0.0500 time 0.6497 (0.5832) data time 0.0008 (0.0067) model time 0.6489 (0.5805) loss 7.1615 (7.5287) grad_norm 2.7484 (2.1918) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:35:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][70/625] eta 0:05:26 lr 0.000782 wd 0.0500 time 0.5615 (0.5888) data time 0.0008 (0.0059) model time 0.5607 (0.6012) loss 8.0132 (7.5374) grad_norm 3.2400 (2.1842) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:35:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][80/625] eta 0:05:22 lr 0.000782 wd 0.0500 time 0.7025 (0.5920) data time 0.0006 (0.0053) model time 0.7019 (0.6056) loss 7.6574 (7.5676) grad_norm 2.3218 (2.2171) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:35:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][90/625] eta 0:05:19 lr 0.000782 wd 0.0500 time 0.7682 (0.5974) data time 0.0006 (0.0048) model time 0.7676 (0.6143) loss 7.6525 (7.5561) grad_norm 5.1410 (2.3207) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:35:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][100/625] eta 0:05:13 lr 0.000782 wd 0.0500 time 0.5719 (0.5965) data time 0.0008 (0.0044) model time 0.5711 (0.6088) loss 8.8374 (7.5382) grad_norm 2.2531 (2.3450) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:36:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][110/625] eta 0:05:06 lr 0.000782 wd 0.0500 time 0.5758 (0.5945) data time 0.0008 (0.0040) model time 0.5750 (0.6031) loss 7.4898 (7.5145) grad_norm 1.8615 (2.3111) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:36:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][120/625] eta 0:04:59 lr 0.000782 wd 0.0500 time 0.5638 (0.5930) data time 0.0006 (0.0038) model time 0.5632 (0.5991) loss 7.8793 (7.5616) grad_norm 2.1163 (2.2795) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:36:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][130/625] eta 0:04:52 lr 0.000782 wd 0.0500 time 0.5703 (0.5916) data time 0.0008 (0.0035) model time 0.5695 (0.5960) loss 6.6563 (7.5389) grad_norm 2.2856 (2.2606) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:36:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][140/625] eta 0:04:46 lr 0.000782 wd 0.0500 time 0.5794 (0.5905) data time 0.0009 (0.0034) model time 0.5785 (0.5936) loss 7.6785 (7.5070) grad_norm 1.8612 (2.2687) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:36:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][150/625] eta 0:04:39 lr 0.000782 wd 0.0500 time 0.5684 (0.5895) data time 0.0006 (0.0032) model time 0.5678 (0.5916) loss 8.2069 (7.5125) grad_norm 2.2564 (2.2617) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:36:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][160/625] eta 0:04:33 lr 0.000782 wd 0.0500 time 0.5745 (0.5886) data time 0.0008 (0.0031) model time 0.5737 (0.5900) loss 5.7245 (7.5107) grad_norm 1.6696 (2.2352) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:36:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][170/625] eta 0:04:27 lr 0.000781 wd 0.0500 time 0.5751 (0.5881) data time 0.0007 (0.0029) model time 0.5743 (0.5892) loss 7.8472 (7.5222) grad_norm 2.5562 (2.2287) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:36:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][180/625] eta 0:04:21 lr 0.000781 wd 0.0500 time 0.5717 (0.5875) data time 0.0007 (0.0028) model time 0.5709 (0.5882) loss 7.1193 (7.5303) grad_norm 2.2533 (2.2302) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:36:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][190/625] eta 0:04:15 lr 0.000781 wd 0.0500 time 0.5707 (0.5868) data time 0.0008 (0.0027) model time 0.5699 (0.5872) loss 7.0858 (7.5092) grad_norm 2.5878 (2.2190) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:36:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][200/625] eta 0:04:09 lr 0.000781 wd 0.0500 time 0.5745 (0.5863) data time 0.0008 (0.0026) model time 0.5737 (0.5864) loss 8.8440 (7.5064) grad_norm 2.3117 (2.2285) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:37:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][210/625] eta 0:04:03 lr 0.000781 wd 0.0500 time 0.5701 (0.5857) data time 0.0007 (0.0025) model time 0.5694 (0.5856) loss 8.4337 (7.5289) grad_norm 2.3840 (2.2215) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:37:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][220/625] eta 0:03:57 lr 0.000781 wd 0.0500 time 0.5723 (0.5854) data time 0.0006 (0.0024) model time 0.5718 (0.5851) loss 5.6549 (7.5452) grad_norm 1.6528 (2.2387) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:37:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][230/625] eta 0:03:51 lr 0.000781 wd 0.0500 time 0.5697 (0.5849) data time 0.0009 (0.0024) model time 0.5688 (0.5844) loss 8.0230 (7.5373) grad_norm 1.8980 (2.2356) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:37:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][240/625] eta 0:03:45 lr 0.000781 wd 0.0500 time 0.5738 (0.5845) data time 0.0008 (0.0023) model time 0.5730 (0.5839) loss 8.5390 (7.5495) grad_norm 1.4460 (2.2237) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:37:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][250/625] eta 0:03:39 lr 0.000781 wd 0.0500 time 0.5625 (0.5843) data time 0.0006 (0.0023) model time 0.5619 (0.5836) loss 6.7983 (7.5525) grad_norm 3.0394 (2.2296) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:37:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][260/625] eta 0:03:33 lr 0.000781 wd 0.0500 time 0.5725 (0.5842) data time 0.0006 (0.0022) model time 0.5719 (0.5835) loss 6.8003 (7.5477) grad_norm 1.7211 (2.2313) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:37:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][270/625] eta 0:03:27 lr 0.000780 wd 0.0500 time 0.5777 (0.5841) data time 0.0006 (0.0022) model time 0.5770 (0.5833) loss 8.3156 (7.5433) grad_norm 1.8345 (2.2266) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:37:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][280/625] eta 0:03:21 lr 0.000780 wd 0.0500 time 0.7606 (0.5847) data time 0.0006 (0.0021) model time 0.7600 (0.5841) loss 7.0734 (7.5606) grad_norm 2.7593 (2.2308) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:37:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][290/625] eta 0:03:16 lr 0.000780 wd 0.0500 time 0.7160 (0.5866) data time 0.0006 (0.0021) model time 0.7154 (0.5864) loss 6.3718 (7.5652) grad_norm 3.4272 (2.2262) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:37:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][300/625] eta 0:03:10 lr 0.000780 wd 0.0500 time 0.7053 (0.5876) data time 0.0006 (0.0020) model time 0.7047 (0.5876) loss 8.2292 (7.5608) grad_norm 3.6000 (2.2557) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:38:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][310/625] eta 0:03:05 lr 0.000780 wd 0.0500 time 0.5730 (0.5892) data time 0.0008 (0.0020) model time 0.5722 (0.5895) loss 9.0986 (7.5493) grad_norm 1.9285 (2.2667) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:38:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][320/625] eta 0:02:59 lr 0.000780 wd 0.0500 time 0.5735 (0.5895) data time 0.0007 (0.0020) model time 0.5728 (0.5898) loss 6.8515 (7.5515) grad_norm 2.1271 (2.2606) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:38:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][330/625] eta 0:02:53 lr 0.000780 wd 0.0500 time 0.5692 (0.5892) data time 0.0008 (0.0019) model time 0.5685 (0.5894) loss 5.6268 (7.5480) grad_norm 2.9058 (2.2838) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:38:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][340/625] eta 0:02:47 lr 0.000780 wd 0.0500 time 0.5724 (0.5888) data time 0.0008 (0.0019) model time 0.5716 (0.5889) loss 8.8580 (7.5417) grad_norm 2.3738 (2.3346) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:38:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][350/625] eta 0:02:41 lr 0.000780 wd 0.0500 time 0.5717 (0.5884) data time 0.0007 (0.0019) model time 0.5710 (0.5883) loss 5.9032 (7.5274) grad_norm 2.9622 (2.3641) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:38:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][360/625] eta 0:02:35 lr 0.000780 wd 0.0500 time 0.5664 (0.5880) data time 0.0008 (0.0018) model time 0.5656 (0.5879) loss 7.1120 (7.5172) grad_norm 1.9354 (2.3643) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:38:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][370/625] eta 0:02:29 lr 0.000779 wd 0.0500 time 0.5706 (0.5877) data time 0.0008 (0.0018) model time 0.5698 (0.5875) loss 7.0726 (7.5105) grad_norm 2.8926 (2.3666) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 03:38:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][380/625] eta 0:02:23 lr 0.000779 wd 0.0500 time 0.5709 (0.5874) data time 0.0008 (0.0018) model time 0.5701 (0.5871) loss 8.3346 (7.5226) grad_norm 1.9495 (2.3620) loss_scale 2048.0000 (1040.1260) mem 22339MB +[2024-07-25 03:38:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][390/625] eta 0:02:17 lr 0.000779 wd 0.0500 time 0.5724 (0.5870) data time 0.0006 (0.0018) model time 0.5718 (0.5867) loss 6.3045 (7.5323) grad_norm 2.0833 (2.3804) loss_scale 2048.0000 (1065.9028) mem 22339MB +[2024-07-25 03:38:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][400/625] eta 0:02:12 lr 0.000779 wd 0.0500 time 0.5721 (0.5867) data time 0.0008 (0.0017) model time 0.5713 (0.5863) loss 9.1631 (7.5427) grad_norm 2.7723 (2.3795) loss_scale 2048.0000 (1090.3940) mem 22339MB +[2024-07-25 03:39:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][410/625] eta 0:02:06 lr 0.000779 wd 0.0500 time 0.5698 (0.5865) data time 0.0008 (0.0017) model time 0.5689 (0.5860) loss 8.9589 (7.5428) grad_norm 2.3571 (2.3885) loss_scale 2048.0000 (1113.6934) mem 22339MB +[2024-07-25 03:39:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][420/625] eta 0:02:00 lr 0.000779 wd 0.0500 time 0.5745 (0.5862) data time 0.0008 (0.0017) model time 0.5737 (0.5857) loss 7.3902 (7.5372) grad_norm 2.4715 (2.3923) loss_scale 2048.0000 (1135.8860) mem 22339MB +[2024-07-25 03:39:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][430/625] eta 0:01:54 lr 0.000779 wd 0.0500 time 0.5728 (0.5859) data time 0.0010 (0.0017) model time 0.5718 (0.5854) loss 7.3503 (7.5326) grad_norm 2.2687 (2.3994) loss_scale 2048.0000 (1157.0487) mem 22339MB +[2024-07-25 03:39:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][440/625] eta 0:01:48 lr 0.000779 wd 0.0500 time 0.5759 (0.5857) data time 0.0008 (0.0017) model time 0.5751 (0.5851) loss 7.8389 (7.5341) grad_norm 2.1092 (2.3954) loss_scale 2048.0000 (1177.2517) mem 22339MB +[2024-07-25 03:39:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][450/625] eta 0:01:42 lr 0.000779 wd 0.0500 time 0.5739 (0.5854) data time 0.0006 (0.0016) model time 0.5732 (0.5848) loss 7.8268 (7.5396) grad_norm 2.2210 (2.3928) loss_scale 2048.0000 (1196.5588) mem 22339MB +[2024-07-25 03:39:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][460/625] eta 0:01:36 lr 0.000779 wd 0.0500 time 0.5704 (0.5852) data time 0.0009 (0.0016) model time 0.5695 (0.5845) loss 6.6984 (7.5371) grad_norm 2.1197 (2.3883) loss_scale 2048.0000 (1215.0282) mem 22339MB +[2024-07-25 03:39:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][470/625] eta 0:01:30 lr 0.000778 wd 0.0500 time 0.5740 (0.5850) data time 0.0008 (0.0016) model time 0.5732 (0.5843) loss 8.7565 (7.5272) grad_norm 1.7873 (2.3806) loss_scale 2048.0000 (1232.7134) mem 22339MB +[2024-07-25 03:39:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][480/625] eta 0:01:24 lr 0.000778 wd 0.0500 time 0.5622 (0.5852) data time 0.0008 (0.0016) model time 0.5614 (0.5846) loss 8.0020 (7.5270) grad_norm 1.9111 (2.3728) loss_scale 2048.0000 (1249.6632) mem 22339MB +[2024-07-25 03:39:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][490/625] eta 0:01:18 lr 0.000778 wd 0.0500 time 0.5648 (0.5850) data time 0.0006 (0.0016) model time 0.5642 (0.5844) loss 7.8861 (7.5337) grad_norm 1.7518 (2.3675) loss_scale 2048.0000 (1265.9226) mem 22339MB +[2024-07-25 03:39:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][500/625] eta 0:01:13 lr 0.000778 wd 0.0500 time 0.5662 (0.5853) data time 0.0008 (0.0016) model time 0.5654 (0.5847) loss 8.7230 (7.5358) grad_norm 1.5318 (2.3644) loss_scale 2048.0000 (1281.5329) mem 22339MB +[2024-07-25 03:39:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][510/625] eta 0:01:07 lr 0.000778 wd 0.0500 time 0.7573 (0.5868) data time 0.0007 (0.0015) model time 0.7566 (0.5863) loss 7.6191 (7.5341) grad_norm 2.0447 (2.3628) loss_scale 2048.0000 (1296.5323) mem 22339MB +[2024-07-25 03:40:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][520/625] eta 0:01:01 lr 0.000778 wd 0.0500 time 0.7684 (0.5878) data time 0.0006 (0.0015) model time 0.7678 (0.5874) loss 7.9399 (7.5417) grad_norm 3.0806 (2.3625) loss_scale 2048.0000 (1310.9559) mem 22339MB +[2024-07-25 03:40:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][530/625] eta 0:00:55 lr 0.000778 wd 0.0500 time 0.5736 (0.5887) data time 0.0006 (0.0015) model time 0.5729 (0.5884) loss 6.3762 (7.5421) grad_norm 1.7671 (2.3578) loss_scale 2048.0000 (1324.8362) mem 22339MB +[2024-07-25 03:40:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][540/625] eta 0:00:50 lr 0.000778 wd 0.0500 time 0.5635 (0.5891) data time 0.0008 (0.0015) model time 0.5627 (0.5888) loss 6.2044 (7.5364) grad_norm 1.7601 (2.3518) loss_scale 2048.0000 (1338.2033) mem 22339MB +[2024-07-25 03:40:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][550/625] eta 0:00:44 lr 0.000778 wd 0.0500 time 0.5618 (0.5889) data time 0.0009 (0.0015) model time 0.5610 (0.5886) loss 6.5988 (7.5327) grad_norm 1.7462 (2.3468) loss_scale 2048.0000 (1351.0853) mem 22339MB +[2024-07-25 03:40:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][560/625] eta 0:00:38 lr 0.000777 wd 0.0500 time 0.5726 (0.5887) data time 0.0008 (0.0015) model time 0.5718 (0.5884) loss 9.0142 (7.5340) grad_norm 2.7924 (2.3439) loss_scale 2048.0000 (1363.5080) mem 22339MB +[2024-07-25 03:40:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][570/625] eta 0:00:32 lr 0.000777 wd 0.0500 time 0.5692 (0.5885) data time 0.0008 (0.0015) model time 0.5684 (0.5881) loss 7.3059 (7.5330) grad_norm 1.5922 (2.3363) loss_scale 2048.0000 (1375.4956) mem 22339MB +[2024-07-25 03:40:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][580/625] eta 0:00:26 lr 0.000777 wd 0.0500 time 0.5740 (0.5882) data time 0.0008 (0.0015) model time 0.5733 (0.5878) loss 8.2297 (7.5349) grad_norm 2.1162 (2.3334) loss_scale 2048.0000 (1387.0706) mem 22339MB +[2024-07-25 03:40:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][590/625] eta 0:00:20 lr 0.000777 wd 0.0500 time 0.5696 (0.5880) data time 0.0006 (0.0015) model time 0.5690 (0.5876) loss 8.3251 (7.5326) grad_norm 2.1436 (2.3343) loss_scale 2048.0000 (1398.2538) mem 22339MB +[2024-07-25 03:40:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][600/625] eta 0:00:14 lr 0.000777 wd 0.0500 time 0.5606 (0.5878) data time 0.0006 (0.0014) model time 0.5599 (0.5874) loss 6.6830 (7.5367) grad_norm 1.6815 (2.3450) loss_scale 2048.0000 (1409.0649) mem 22339MB +[2024-07-25 03:40:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][610/625] eta 0:00:08 lr 0.000777 wd 0.0500 time 0.5694 (0.5878) data time 0.0006 (0.0014) model time 0.5689 (0.5873) loss 7.1966 (7.5361) grad_norm 2.5198 (2.3424) loss_scale 2048.0000 (1419.5221) mem 22339MB +[2024-07-25 03:41:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [133/300][620/625] eta 0:00:02 lr 0.000777 wd 0.0500 time 0.5683 (0.5876) data time 0.0006 (0.0014) model time 0.5677 (0.5871) loss 7.2302 (7.5320) grad_norm 2.2384 (2.3465) loss_scale 2048.0000 (1429.6425) mem 22339MB +[2024-07-25 03:41:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 133 training takes 0:06:07 +[2024-07-25 03:41:06 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 03:41:07 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 03:41:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.470 (0.470) Loss 0.5244 (0.5244) Acc@1 88.867 (88.867) Acc@5 98.584 (98.584) Mem 22339MB +[2024-07-25 03:41:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.8325 (0.6591) Acc@1 80.469 (85.986) Acc@5 96.338 (97.612) Mem 22339MB +[2024-07-25 03:41:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.9937 (0.7789) Acc@1 76.709 (82.650) Acc@5 94.287 (96.322) Mem 22339MB +[2024-07-25 03:41:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.324 Acc@5 96.317 +[2024-07-25 03:41:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.3% +[2024-07-25 03:41:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 82.32% +[2024-07-25 03:41:11 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 03:41:12 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 03:41:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.469 (0.469) Loss 0.4995 (0.4995) Acc@1 89.111 (89.111) Acc@5 98.633 (98.633) Mem 22339MB +[2024-07-25 03:41:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.156) Loss 0.7891 (0.6293) Acc@1 81.006 (86.324) Acc@5 96.240 (97.701) Mem 22339MB +[2024-07-25 03:41:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9141 (0.7375) Acc@1 77.246 (83.096) Acc@5 95.557 (96.647) Mem 22339MB +[2024-07-25 03:41:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.778 Acc@5 96.657 +[2024-07-25 03:41:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.8% +[2024-07-25 03:41:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.78% +[2024-07-25 03:41:16 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 03:41:17 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 03:41:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][0/625] eta 0:08:39 lr 0.000777 wd 0.0500 time 0.8314 (0.8314) data time 0.3134 (0.3134) model time 0.0000 (0.0000) loss 7.1012 (7.1012) grad_norm 1.8325 (1.8325) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:41:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][10/625] eta 0:06:06 lr 0.000777 wd 0.0500 time 0.5740 (0.5964) data time 0.0008 (0.0293) model time 0.0000 (0.0000) loss 8.6675 (7.1971) grad_norm 1.5602 (2.0764) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:41:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][20/625] eta 0:05:55 lr 0.000777 wd 0.0500 time 0.5701 (0.5872) data time 0.0007 (0.0157) model time 0.0000 (0.0000) loss 9.0728 (7.3585) grad_norm 2.5956 (2.1427) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:41:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][30/625] eta 0:05:46 lr 0.000777 wd 0.0500 time 0.5687 (0.5827) data time 0.0009 (0.0109) model time 0.0000 (0.0000) loss 7.3912 (7.4140) grad_norm 2.7483 (2.4046) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:41:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][40/625] eta 0:05:40 lr 0.000776 wd 0.0500 time 0.5745 (0.5815) data time 0.0007 (0.0086) model time 0.0000 (0.0000) loss 8.7640 (7.5774) grad_norm 3.1294 (2.4423) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:41:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][50/625] eta 0:05:33 lr 0.000776 wd 0.0500 time 0.5695 (0.5801) data time 0.0007 (0.0071) model time 0.0000 (0.0000) loss 8.0726 (7.6061) grad_norm 2.2023 (2.4520) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:41:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][60/625] eta 0:05:27 lr 0.000776 wd 0.0500 time 0.5619 (0.5798) data time 0.0009 (0.0060) model time 0.5610 (0.5777) loss 8.7044 (7.5455) grad_norm 2.9311 (2.4734) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:41:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][70/625] eta 0:05:21 lr 0.000776 wd 0.0500 time 0.5721 (0.5792) data time 0.0006 (0.0053) model time 0.5715 (0.5760) loss 8.2206 (7.5134) grad_norm 1.8241 (2.4826) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:42:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][80/625] eta 0:05:15 lr 0.000776 wd 0.0500 time 0.5727 (0.5786) data time 0.0006 (0.0048) model time 0.5721 (0.5751) loss 7.9749 (7.4821) grad_norm 2.1611 (2.5004) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:42:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][90/625] eta 0:05:10 lr 0.000776 wd 0.0500 time 0.7091 (0.5797) data time 0.0010 (0.0044) model time 0.7082 (0.5782) loss 6.4993 (7.5000) grad_norm 2.8403 (2.5221) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:42:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][100/625] eta 0:05:05 lr 0.000776 wd 0.0500 time 0.5719 (0.5820) data time 0.0006 (0.0040) model time 0.5713 (0.5830) loss 8.2029 (7.4675) grad_norm 3.0578 (2.5119) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:42:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][110/625] eta 0:05:02 lr 0.000776 wd 0.0500 time 0.5759 (0.5880) data time 0.0008 (0.0037) model time 0.5751 (0.5938) loss 9.2976 (7.4759) grad_norm 3.0211 (2.5342) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:42:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][120/625] eta 0:04:58 lr 0.000776 wd 0.0500 time 0.6950 (0.5919) data time 0.0008 (0.0035) model time 0.6942 (0.5996) loss 6.4866 (7.4544) grad_norm 1.7061 (2.5085) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:42:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][130/625] eta 0:04:54 lr 0.000776 wd 0.0500 time 0.5716 (0.5942) data time 0.0007 (0.0033) model time 0.5709 (0.6024) loss 7.5835 (7.4851) grad_norm 1.6420 (2.4627) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:42:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][140/625] eta 0:04:47 lr 0.000775 wd 0.0500 time 0.5747 (0.5938) data time 0.0008 (0.0031) model time 0.5739 (0.6007) loss 8.6884 (7.4774) grad_norm 1.5701 (2.4284) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:42:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][150/625] eta 0:04:41 lr 0.000775 wd 0.0500 time 0.5720 (0.5925) data time 0.0006 (0.0029) model time 0.5713 (0.5981) loss 6.8336 (7.4612) grad_norm 2.6878 (2.4324) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:42:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][160/625] eta 0:04:35 lr 0.000775 wd 0.0500 time 0.5718 (0.5914) data time 0.0007 (0.0028) model time 0.5711 (0.5958) loss 8.4246 (7.4792) grad_norm 2.5579 (2.4301) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:42:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][170/625] eta 0:04:28 lr 0.000775 wd 0.0500 time 0.5679 (0.5904) data time 0.0009 (0.0027) model time 0.5670 (0.5940) loss 6.7292 (7.4860) grad_norm 2.2641 (2.4187) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:43:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][180/625] eta 0:04:22 lr 0.000775 wd 0.0500 time 0.5668 (0.5895) data time 0.0006 (0.0026) model time 0.5661 (0.5923) loss 8.0485 (7.4667) grad_norm 1.7651 (2.3935) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:43:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][190/625] eta 0:04:16 lr 0.000775 wd 0.0500 time 0.5798 (0.5887) data time 0.0006 (0.0025) model time 0.5791 (0.5909) loss 7.1247 (7.4735) grad_norm 1.9315 (2.3659) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:43:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][200/625] eta 0:04:10 lr 0.000775 wd 0.0500 time 0.5722 (0.5887) data time 0.0008 (0.0024) model time 0.5714 (0.5907) loss 9.1988 (7.4904) grad_norm 1.8917 (2.3556) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:43:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][210/625] eta 0:04:04 lr 0.000775 wd 0.0500 time 0.5757 (0.5880) data time 0.0007 (0.0023) model time 0.5750 (0.5897) loss 6.6751 (7.5021) grad_norm 2.9726 (2.3431) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:43:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][220/625] eta 0:03:57 lr 0.000775 wd 0.0500 time 0.5744 (0.5875) data time 0.0006 (0.0023) model time 0.5738 (0.5888) loss 6.8112 (7.4914) grad_norm 3.1606 (2.3629) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:43:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][230/625] eta 0:03:51 lr 0.000774 wd 0.0500 time 0.5692 (0.5869) data time 0.0007 (0.0022) model time 0.5685 (0.5880) loss 7.3711 (7.4942) grad_norm 1.7558 (2.3616) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:43:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][240/625] eta 0:03:45 lr 0.000774 wd 0.0500 time 0.5694 (0.5863) data time 0.0008 (0.0021) model time 0.5685 (0.5872) loss 7.1326 (7.4825) grad_norm 1.6580 (2.3512) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:43:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][250/625] eta 0:03:39 lr 0.000774 wd 0.0500 time 0.5698 (0.5859) data time 0.0006 (0.0021) model time 0.5692 (0.5866) loss 5.8055 (7.4795) grad_norm 4.6483 (2.3593) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:43:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][260/625] eta 0:03:33 lr 0.000774 wd 0.0500 time 0.5741 (0.5856) data time 0.0009 (0.0020) model time 0.5732 (0.5861) loss 7.7672 (7.4779) grad_norm 1.7829 (2.3438) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:43:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][270/625] eta 0:03:27 lr 0.000774 wd 0.0500 time 0.5742 (0.5852) data time 0.0006 (0.0020) model time 0.5735 (0.5855) loss 7.4034 (7.4802) grad_norm 1.8769 (2.3356) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:44:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][280/625] eta 0:03:21 lr 0.000774 wd 0.0500 time 0.5753 (0.5848) data time 0.0008 (0.0020) model time 0.5745 (0.5850) loss 9.0260 (7.4875) grad_norm 1.9723 (2.3410) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:44:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][290/625] eta 0:03:15 lr 0.000774 wd 0.0500 time 0.5705 (0.5845) data time 0.0008 (0.0019) model time 0.5697 (0.5846) loss 9.0573 (7.4876) grad_norm 1.8238 (2.3412) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:44:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][300/625] eta 0:03:09 lr 0.000774 wd 0.0500 time 0.5759 (0.5841) data time 0.0007 (0.0019) model time 0.5751 (0.5841) loss 8.4735 (7.4994) grad_norm 2.7641 (2.3447) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:44:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][310/625] eta 0:03:03 lr 0.000774 wd 0.0500 time 0.5729 (0.5840) data time 0.0006 (0.0018) model time 0.5723 (0.5839) loss 6.2188 (7.4982) grad_norm 3.2178 (2.3885) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:44:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][320/625] eta 0:02:58 lr 0.000774 wd 0.0500 time 0.6858 (0.5848) data time 0.0007 (0.0018) model time 0.6851 (0.5849) loss 8.9799 (7.5103) grad_norm 1.9974 (2.4020) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:44:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][330/625] eta 0:02:52 lr 0.000773 wd 0.0500 time 0.7218 (0.5863) data time 0.0006 (0.0018) model time 0.7212 (0.5866) loss 5.9648 (7.5116) grad_norm 2.4230 (2.4066) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:44:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][340/625] eta 0:02:47 lr 0.000773 wd 0.0500 time 0.6624 (0.5870) data time 0.0006 (0.0018) model time 0.6618 (0.5873) loss 7.8371 (7.5177) grad_norm 2.2609 (2.4034) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:44:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][350/625] eta 0:02:41 lr 0.000773 wd 0.0500 time 0.5643 (0.5883) data time 0.0007 (0.0017) model time 0.5636 (0.5889) loss 7.6660 (7.5318) grad_norm 2.3283 (2.4024) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:44:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][360/625] eta 0:02:35 lr 0.000773 wd 0.0500 time 0.5739 (0.5879) data time 0.0006 (0.0017) model time 0.5733 (0.5883) loss 6.1719 (7.5361) grad_norm 3.7409 (2.3911) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:44:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][370/625] eta 0:02:29 lr 0.000773 wd 0.0500 time 0.5707 (0.5877) data time 0.0008 (0.0017) model time 0.5699 (0.5881) loss 6.6471 (7.5416) grad_norm 2.0373 (2.3810) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:45:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][380/625] eta 0:02:23 lr 0.000773 wd 0.0500 time 0.5719 (0.5874) data time 0.0006 (0.0017) model time 0.5713 (0.5877) loss 8.7156 (7.5533) grad_norm 1.6152 (2.3813) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:45:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][390/625] eta 0:02:17 lr 0.000773 wd 0.0500 time 0.5720 (0.5871) data time 0.0008 (0.0016) model time 0.5712 (0.5873) loss 7.1002 (7.5471) grad_norm 2.2831 (2.3788) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:45:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][400/625] eta 0:02:12 lr 0.000773 wd 0.0500 time 0.5718 (0.5868) data time 0.0008 (0.0016) model time 0.5710 (0.5870) loss 7.8174 (7.5408) grad_norm 2.6692 (2.3684) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:45:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][410/625] eta 0:02:06 lr 0.000773 wd 0.0500 time 0.5742 (0.5866) data time 0.0008 (0.0016) model time 0.5734 (0.5867) loss 6.5248 (7.5281) grad_norm 1.8527 (2.3562) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:45:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][420/625] eta 0:02:00 lr 0.000773 wd 0.0500 time 0.5631 (0.5864) data time 0.0008 (0.0016) model time 0.5623 (0.5865) loss 8.1707 (7.5293) grad_norm 1.4735 (2.3573) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:45:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][430/625] eta 0:01:54 lr 0.000772 wd 0.0500 time 0.5660 (0.5862) data time 0.0006 (0.0016) model time 0.5654 (0.5862) loss 6.4676 (7.5224) grad_norm 4.6405 (2.3709) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:45:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][440/625] eta 0:01:48 lr 0.000772 wd 0.0500 time 0.5660 (0.5859) data time 0.0008 (0.0016) model time 0.5652 (0.5859) loss 7.7417 (7.5214) grad_norm 2.0502 (2.3679) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:45:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][450/625] eta 0:01:42 lr 0.000772 wd 0.0500 time 0.5650 (0.5857) data time 0.0008 (0.0015) model time 0.5642 (0.5856) loss 7.7314 (7.5247) grad_norm 1.7691 (2.3679) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:45:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][460/625] eta 0:01:36 lr 0.000772 wd 0.0500 time 0.5628 (0.5855) data time 0.0009 (0.0015) model time 0.5619 (0.5854) loss 8.4740 (7.5198) grad_norm 2.1048 (2.3752) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:45:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][470/625] eta 0:01:30 lr 0.000772 wd 0.0500 time 0.5686 (0.5854) data time 0.0007 (0.0015) model time 0.5680 (0.5852) loss 6.9189 (7.5241) grad_norm 4.0873 (2.3895) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:45:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][480/625] eta 0:01:24 lr 0.000772 wd 0.0500 time 0.5719 (0.5852) data time 0.0008 (0.0015) model time 0.5711 (0.5850) loss 8.1597 (7.5301) grad_norm 2.2333 (2.3877) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:46:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][490/625] eta 0:01:18 lr 0.000772 wd 0.0500 time 0.5703 (0.5850) data time 0.0008 (0.0015) model time 0.5695 (0.5847) loss 8.5897 (7.5250) grad_norm 1.7480 (2.3780) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:46:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][500/625] eta 0:01:13 lr 0.000772 wd 0.0500 time 0.5706 (0.5848) data time 0.0010 (0.0015) model time 0.5696 (0.5845) loss 7.0535 (7.5331) grad_norm 2.3233 (2.3830) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:46:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][510/625] eta 0:01:07 lr 0.000772 wd 0.0500 time 0.5705 (0.5846) data time 0.0008 (0.0014) model time 0.5696 (0.5843) loss 6.3900 (7.5347) grad_norm 1.7743 (2.3806) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:46:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][520/625] eta 0:01:01 lr 0.000772 wd 0.0500 time 0.5657 (0.5844) data time 0.0006 (0.0014) model time 0.5650 (0.5841) loss 6.3989 (7.5326) grad_norm 2.4035 (2.3892) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:46:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][530/625] eta 0:00:55 lr 0.000771 wd 0.0500 time 0.5699 (0.5844) data time 0.0006 (0.0014) model time 0.5693 (0.5841) loss 7.9895 (7.5301) grad_norm 2.0220 (2.3806) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:46:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][540/625] eta 0:00:49 lr 0.000771 wd 0.0500 time 0.7448 (0.5852) data time 0.0006 (0.0014) model time 0.7442 (0.5849) loss 7.5885 (7.5282) grad_norm 1.7852 (2.3721) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:46:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][550/625] eta 0:00:43 lr 0.000771 wd 0.0500 time 0.5681 (0.5858) data time 0.0006 (0.0014) model time 0.5675 (0.5856) loss 7.1145 (7.5279) grad_norm 1.6794 (2.3623) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:46:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][560/625] eta 0:00:38 lr 0.000771 wd 0.0500 time 0.7241 (0.5867) data time 0.0006 (0.0014) model time 0.7235 (0.5865) loss 5.9045 (7.5247) grad_norm 2.4732 (2.3597) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:46:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][570/625] eta 0:00:32 lr 0.000771 wd 0.0500 time 0.7662 (0.5875) data time 0.0006 (0.0014) model time 0.7656 (0.5874) loss 8.3970 (7.5253) grad_norm 1.7696 (2.3633) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:46:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][580/625] eta 0:00:26 lr 0.000771 wd 0.0500 time 0.5734 (0.5873) data time 0.0008 (0.0014) model time 0.5726 (0.5872) loss 6.1592 (7.5197) grad_norm 2.5670 (2.3635) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:47:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][590/625] eta 0:00:20 lr 0.000771 wd 0.0500 time 0.5725 (0.5871) data time 0.0006 (0.0014) model time 0.5719 (0.5869) loss 8.0841 (7.5309) grad_norm 3.0595 (2.3617) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:47:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][600/625] eta 0:00:14 lr 0.000771 wd 0.0500 time 0.5726 (0.5868) data time 0.0006 (0.0013) model time 0.5720 (0.5867) loss 6.9984 (7.5285) grad_norm 3.1866 (2.3657) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:47:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][610/625] eta 0:00:08 lr 0.000771 wd 0.0500 time 0.5715 (0.5867) data time 0.0004 (0.0013) model time 0.5711 (0.5864) loss 7.7727 (7.5295) grad_norm 2.6557 (2.3610) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:47:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [134/300][620/625] eta 0:00:02 lr 0.000770 wd 0.0500 time 0.5691 (0.5865) data time 0.0006 (0.0013) model time 0.5685 (0.5862) loss 5.7745 (7.5276) grad_norm 2.3427 (2.3554) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:47:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 134 training takes 0:06:06 +[2024-07-25 03:47:24 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 03:47:25 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 03:47:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.482 (0.482) Loss 0.5220 (0.5220) Acc@1 89.307 (89.307) Acc@5 98.828 (98.828) Mem 22339MB +[2024-07-25 03:47:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.159) Loss 0.8247 (0.6649) Acc@1 80.811 (85.889) Acc@5 95.654 (97.550) Mem 22339MB +[2024-07-25 03:47:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.143) Loss 0.9746 (0.7806) Acc@1 76.270 (82.622) Acc@5 94.824 (96.350) Mem 22339MB +[2024-07-25 03:47:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.198 Acc@5 96.315 +[2024-07-25 03:47:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.2% +[2024-07-25 03:47:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 1.006 (1.006) Loss 0.4995 (0.4995) Acc@1 89.258 (89.258) Acc@5 98.633 (98.633) Mem 22339MB +[2024-07-25 03:47:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.206) Loss 0.7871 (0.6293) Acc@1 81.152 (86.359) Acc@5 96.240 (97.714) Mem 22339MB +[2024-07-25 03:47:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.167) Loss 0.9146 (0.7372) Acc@1 77.441 (83.133) Acc@5 95.459 (96.649) Mem 22339MB +[2024-07-25 03:47:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.808 Acc@5 96.655 +[2024-07-25 03:47:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.8% +[2024-07-25 03:47:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.81% +[2024-07-25 03:47:33 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 03:47:34 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 03:47:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][0/625] eta 0:09:29 lr 0.000770 wd 0.0500 time 0.9111 (0.9111) data time 0.3937 (0.3937) model time 0.0000 (0.0000) loss 6.8515 (6.8515) grad_norm 2.0698 (2.0698) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:47:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][10/625] eta 0:06:11 lr 0.000770 wd 0.0500 time 0.5735 (0.6046) data time 0.0008 (0.0365) model time 0.0000 (0.0000) loss 7.0649 (6.7025) grad_norm 2.3717 (2.3151) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:47:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][20/625] eta 0:05:57 lr 0.000770 wd 0.0500 time 0.5724 (0.5906) data time 0.0007 (0.0196) model time 0.0000 (0.0000) loss 8.4131 (7.1312) grad_norm 4.3166 (2.4901) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:47:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][30/625] eta 0:05:48 lr 0.000770 wd 0.0500 time 0.5707 (0.5858) data time 0.0006 (0.0135) model time 0.0000 (0.0000) loss 5.6231 (7.3210) grad_norm 1.7040 (2.5567) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:47:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][40/625] eta 0:05:41 lr 0.000770 wd 0.0500 time 0.5707 (0.5842) data time 0.0008 (0.0104) model time 0.0000 (0.0000) loss 7.2495 (7.3482) grad_norm 1.9302 (2.4179) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:48:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][50/625] eta 0:05:35 lr 0.000770 wd 0.0500 time 0.5682 (0.5826) data time 0.0008 (0.0089) model time 0.0000 (0.0000) loss 8.9878 (7.4223) grad_norm 2.1065 (2.3570) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:48:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][60/625] eta 0:05:28 lr 0.000770 wd 0.0500 time 0.5687 (0.5814) data time 0.0008 (0.0076) model time 0.5680 (0.5743) loss 6.6791 (7.3858) grad_norm 2.2286 (2.3889) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:48:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][70/625] eta 0:05:22 lr 0.000770 wd 0.0500 time 0.5672 (0.5809) data time 0.0006 (0.0066) model time 0.5666 (0.5755) loss 7.3259 (7.4281) grad_norm 2.2001 (2.5621) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:48:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][80/625] eta 0:05:16 lr 0.000770 wd 0.0500 time 0.5730 (0.5803) data time 0.0006 (0.0059) model time 0.5724 (0.5756) loss 6.0948 (7.4512) grad_norm 1.9988 (2.4989) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:48:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][90/625] eta 0:05:10 lr 0.000770 wd 0.0500 time 0.5702 (0.5797) data time 0.0007 (0.0053) model time 0.5695 (0.5752) loss 8.8104 (7.4477) grad_norm 1.7295 (2.4458) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:48:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][100/625] eta 0:05:04 lr 0.000769 wd 0.0500 time 0.5741 (0.5793) data time 0.0008 (0.0049) model time 0.5733 (0.5751) loss 8.1335 (7.4569) grad_norm 1.7864 (2.4356) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:48:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][110/625] eta 0:04:58 lr 0.000769 wd 0.0500 time 0.5638 (0.5789) data time 0.0008 (0.0045) model time 0.5630 (0.5749) loss 7.6694 (7.4731) grad_norm 3.0146 (2.4466) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:48:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][120/625] eta 0:04:52 lr 0.000769 wd 0.0500 time 0.5715 (0.5785) data time 0.0008 (0.0042) model time 0.5707 (0.5746) loss 8.3988 (7.4447) grad_norm 1.7814 (2.4189) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:48:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][130/625] eta 0:04:47 lr 0.000769 wd 0.0500 time 0.7105 (0.5801) data time 0.0008 (0.0040) model time 0.7097 (0.5777) loss 8.8310 (7.4792) grad_norm 1.8889 (2.3885) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:48:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][140/625] eta 0:04:43 lr 0.000769 wd 0.0500 time 0.5725 (0.5837) data time 0.0007 (0.0037) model time 0.5719 (0.5835) loss 8.3489 (7.5025) grad_norm 1.5520 (2.3643) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:49:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][150/625] eta 0:04:39 lr 0.000769 wd 0.0500 time 0.6154 (0.5882) data time 0.0008 (0.0035) model time 0.6146 (0.5903) loss 8.5968 (7.5037) grad_norm 1.7029 (2.3387) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:49:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][160/625] eta 0:04:35 lr 0.000769 wd 0.0500 time 0.7601 (0.5927) data time 0.0008 (0.0034) model time 0.7593 (0.5965) loss 6.7693 (7.4955) grad_norm 1.8993 (2.3220) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:49:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][170/625] eta 0:04:29 lr 0.000769 wd 0.0500 time 0.5712 (0.5931) data time 0.0006 (0.0032) model time 0.5706 (0.5967) loss 6.9785 (7.4868) grad_norm 3.4547 (2.3404) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:49:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][180/625] eta 0:04:23 lr 0.000769 wd 0.0500 time 0.5736 (0.5921) data time 0.0008 (0.0031) model time 0.5728 (0.5951) loss 6.4667 (7.4973) grad_norm 2.2192 (2.3126) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:49:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][190/625] eta 0:04:17 lr 0.000768 wd 0.0500 time 0.5717 (0.5914) data time 0.0008 (0.0030) model time 0.5709 (0.5938) loss 7.6158 (7.5024) grad_norm 2.5245 (2.3104) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:49:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][200/625] eta 0:04:11 lr 0.000768 wd 0.0500 time 0.5710 (0.5906) data time 0.0006 (0.0029) model time 0.5703 (0.5925) loss 7.6295 (7.4997) grad_norm 2.0214 (2.2990) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:49:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][210/625] eta 0:04:04 lr 0.000768 wd 0.0500 time 0.5750 (0.5899) data time 0.0006 (0.0028) model time 0.5743 (0.5915) loss 6.3247 (7.4844) grad_norm 2.0488 (2.2836) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:49:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][220/625] eta 0:03:58 lr 0.000768 wd 0.0500 time 0.5763 (0.5893) data time 0.0008 (0.0027) model time 0.5755 (0.5905) loss 6.1534 (7.4711) grad_norm 2.2036 (2.2689) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:49:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][230/625] eta 0:03:52 lr 0.000768 wd 0.0500 time 0.5716 (0.5887) data time 0.0008 (0.0026) model time 0.5708 (0.5896) loss 7.4636 (7.4704) grad_norm 1.9133 (2.2578) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:49:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][240/625] eta 0:03:46 lr 0.000768 wd 0.0500 time 0.5749 (0.5881) data time 0.0008 (0.0026) model time 0.5741 (0.5887) loss 7.0828 (7.4870) grad_norm 3.0985 (2.2527) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:50:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][250/625] eta 0:03:40 lr 0.000768 wd 0.0500 time 0.5682 (0.5876) data time 0.0006 (0.0025) model time 0.5676 (0.5880) loss 7.2069 (7.4811) grad_norm 2.3540 (2.2853) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:50:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][260/625] eta 0:03:34 lr 0.000768 wd 0.0500 time 0.5665 (0.5872) data time 0.0007 (0.0024) model time 0.5658 (0.5874) loss 6.3397 (7.4797) grad_norm 2.0865 (2.2760) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:50:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][270/625] eta 0:03:28 lr 0.000768 wd 0.0500 time 0.5732 (0.5868) data time 0.0006 (0.0024) model time 0.5726 (0.5868) loss 7.9579 (7.4715) grad_norm 1.9430 (2.2835) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:50:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][280/625] eta 0:03:22 lr 0.000768 wd 0.0500 time 0.5673 (0.5864) data time 0.0008 (0.0024) model time 0.5665 (0.5862) loss 6.3394 (7.4735) grad_norm 1.5995 (2.2906) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:50:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][290/625] eta 0:03:16 lr 0.000767 wd 0.0500 time 0.5780 (0.5860) data time 0.0009 (0.0023) model time 0.5771 (0.5858) loss 8.8866 (7.4770) grad_norm 2.7059 (2.3006) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:50:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][300/625] eta 0:03:10 lr 0.000767 wd 0.0500 time 0.5734 (0.5856) data time 0.0008 (0.0023) model time 0.5725 (0.5853) loss 7.8749 (7.4672) grad_norm 2.7001 (2.2900) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:50:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][310/625] eta 0:03:04 lr 0.000767 wd 0.0500 time 0.5728 (0.5852) data time 0.0006 (0.0023) model time 0.5722 (0.5848) loss 6.8122 (7.4605) grad_norm 2.6986 (2.2949) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:50:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][320/625] eta 0:02:58 lr 0.000767 wd 0.0500 time 0.5715 (0.5849) data time 0.0006 (0.0022) model time 0.5709 (0.5844) loss 7.1523 (7.4565) grad_norm 2.6865 (2.2980) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:50:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][330/625] eta 0:02:52 lr 0.000767 wd 0.0500 time 0.5747 (0.5846) data time 0.0008 (0.0022) model time 0.5739 (0.5841) loss 7.1314 (7.4634) grad_norm 1.7494 (2.2874) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:50:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][340/625] eta 0:02:46 lr 0.000767 wd 0.0500 time 0.7115 (0.5847) data time 0.0006 (0.0021) model time 0.7108 (0.5841) loss 7.8810 (7.4680) grad_norm 1.8357 (2.2801) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:51:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][350/625] eta 0:02:40 lr 0.000767 wd 0.0500 time 0.5755 (0.5844) data time 0.0009 (0.0021) model time 0.5746 (0.5838) loss 6.7290 (7.4772) grad_norm 2.1099 (2.2729) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:51:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][360/625] eta 0:02:35 lr 0.000767 wd 0.0500 time 0.7030 (0.5860) data time 0.0007 (0.0021) model time 0.7023 (0.5856) loss 8.2976 (7.4776) grad_norm 2.4826 (2.2726) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:51:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][370/625] eta 0:02:29 lr 0.000767 wd 0.0500 time 0.6947 (0.5875) data time 0.0009 (0.0020) model time 0.6938 (0.5873) loss 7.9580 (7.4827) grad_norm 2.0682 (2.2684) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:51:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][380/625] eta 0:02:24 lr 0.000767 wd 0.0500 time 0.7269 (0.5892) data time 0.0009 (0.0020) model time 0.7261 (0.5893) loss 7.3467 (7.4862) grad_norm 3.1218 (2.2818) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:51:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][390/625] eta 0:02:18 lr 0.000766 wd 0.0500 time 0.5736 (0.5897) data time 0.0008 (0.0020) model time 0.5727 (0.5899) loss 6.0187 (7.4906) grad_norm 3.3050 (2.2950) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:51:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][400/625] eta 0:02:12 lr 0.000766 wd 0.0500 time 0.5746 (0.5893) data time 0.0006 (0.0019) model time 0.5740 (0.5894) loss 6.6183 (7.4897) grad_norm 3.6341 (2.3081) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:51:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][410/625] eta 0:02:06 lr 0.000766 wd 0.0500 time 0.5725 (0.5892) data time 0.0008 (0.0019) model time 0.5716 (0.5891) loss 7.0985 (7.4885) grad_norm 2.1592 (2.3031) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:51:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][420/625] eta 0:02:00 lr 0.000766 wd 0.0500 time 0.5726 (0.5889) data time 0.0009 (0.0019) model time 0.5717 (0.5888) loss 6.8253 (7.4964) grad_norm 2.5895 (2.2962) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:51:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][430/625] eta 0:01:54 lr 0.000766 wd 0.0500 time 0.5719 (0.5885) data time 0.0006 (0.0019) model time 0.5712 (0.5884) loss 7.2097 (7.4961) grad_norm 1.8941 (2.2890) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:51:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][440/625] eta 0:01:48 lr 0.000766 wd 0.0500 time 0.5734 (0.5882) data time 0.0007 (0.0018) model time 0.5727 (0.5880) loss 6.0051 (7.4952) grad_norm 1.9102 (2.2870) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:52:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][450/625] eta 0:01:42 lr 0.000766 wd 0.0500 time 0.5630 (0.5879) data time 0.0008 (0.0018) model time 0.5621 (0.5877) loss 7.6412 (7.4914) grad_norm 1.5181 (2.2830) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:52:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][460/625] eta 0:01:36 lr 0.000766 wd 0.0500 time 0.5747 (0.5877) data time 0.0008 (0.0018) model time 0.5739 (0.5874) loss 7.3580 (7.4868) grad_norm 2.2668 (2.2825) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:52:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][470/625] eta 0:01:31 lr 0.000766 wd 0.0500 time 0.5718 (0.5875) data time 0.0007 (0.0018) model time 0.5712 (0.5871) loss 6.4743 (7.4881) grad_norm 2.1213 (2.2754) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:52:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][480/625] eta 0:01:25 lr 0.000766 wd 0.0500 time 0.5712 (0.5872) data time 0.0006 (0.0018) model time 0.5707 (0.5868) loss 6.2865 (7.4853) grad_norm 2.4205 (2.2694) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:52:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][490/625] eta 0:01:19 lr 0.000765 wd 0.0500 time 0.5625 (0.5870) data time 0.0008 (0.0018) model time 0.5618 (0.5866) loss 8.8879 (7.5013) grad_norm 1.5030 (2.2759) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:52:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][500/625] eta 0:01:13 lr 0.000765 wd 0.0500 time 0.5737 (0.5868) data time 0.0008 (0.0017) model time 0.5730 (0.5863) loss 6.5182 (7.5024) grad_norm 2.2989 (2.2791) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:52:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][510/625] eta 0:01:07 lr 0.000765 wd 0.0500 time 0.5660 (0.5865) data time 0.0006 (0.0017) model time 0.5654 (0.5860) loss 7.9100 (7.5076) grad_norm 1.6573 (2.2757) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:52:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][520/625] eta 0:01:01 lr 0.000765 wd 0.0500 time 0.5715 (0.5863) data time 0.0006 (0.0017) model time 0.5709 (0.5858) loss 7.8521 (7.5059) grad_norm 1.9829 (2.2671) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:52:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][530/625] eta 0:00:55 lr 0.000765 wd 0.0500 time 0.5730 (0.5861) data time 0.0006 (0.0017) model time 0.5724 (0.5856) loss 6.9874 (7.5082) grad_norm 1.8667 (2.2623) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:52:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][540/625] eta 0:00:49 lr 0.000765 wd 0.0500 time 0.5709 (0.5859) data time 0.0006 (0.0017) model time 0.5703 (0.5854) loss 6.6319 (7.5053) grad_norm 1.7436 (2.2544) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:52:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][550/625] eta 0:00:43 lr 0.000765 wd 0.0500 time 0.5733 (0.5858) data time 0.0006 (0.0016) model time 0.5727 (0.5852) loss 8.3280 (7.5033) grad_norm 2.0684 (2.2483) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:53:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][560/625] eta 0:00:38 lr 0.000765 wd 0.0500 time 0.5723 (0.5856) data time 0.0008 (0.0016) model time 0.5715 (0.5850) loss 6.7919 (7.5032) grad_norm 2.9216 (2.2439) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:53:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][570/625] eta 0:00:32 lr 0.000765 wd 0.0500 time 0.5720 (0.5856) data time 0.0006 (0.0016) model time 0.5713 (0.5850) loss 7.4129 (7.5108) grad_norm 3.6103 (2.2422) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:53:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][580/625] eta 0:00:26 lr 0.000764 wd 0.0500 time 0.6987 (0.5867) data time 0.0008 (0.0016) model time 0.6978 (0.5862) loss 6.8278 (7.5159) grad_norm 2.2674 (2.2464) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:53:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][590/625] eta 0:00:20 lr 0.000764 wd 0.0500 time 0.5732 (0.5875) data time 0.0009 (0.0016) model time 0.5723 (0.5871) loss 7.4559 (7.5111) grad_norm 2.7361 (2.2578) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:53:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][600/625] eta 0:00:14 lr 0.000764 wd 0.0500 time 0.6888 (0.5886) data time 0.0006 (0.0016) model time 0.6882 (0.5882) loss 8.5227 (7.5208) grad_norm 1.7255 (2.2537) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:53:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][610/625] eta 0:00:08 lr 0.000764 wd 0.0500 time 0.5731 (0.5892) data time 0.0004 (0.0016) model time 0.5727 (0.5889) loss 5.9495 (7.5250) grad_norm 1.6763 (2.2618) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:53:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [135/300][620/625] eta 0:00:02 lr 0.000764 wd 0.0500 time 0.5707 (0.5890) data time 0.0004 (0.0015) model time 0.5703 (0.5887) loss 8.3368 (7.5276) grad_norm 2.7986 (2.2651) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:53:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 135 training takes 0:06:08 +[2024-07-25 03:53:43 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 03:53:44 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 03:53:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.476 (0.476) Loss 0.5083 (0.5083) Acc@1 89.746 (89.746) Acc@5 98.682 (98.682) Mem 22339MB +[2024-07-25 03:53:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.8252 (0.6535) Acc@1 80.322 (85.875) Acc@5 96.289 (97.559) Mem 22339MB +[2024-07-25 03:53:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9805 (0.7651) Acc@1 75.488 (82.726) Acc@5 94.775 (96.417) Mem 22339MB +[2024-07-25 03:53:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.352 Acc@5 96.399 +[2024-07-25 03:53:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.4% +[2024-07-25 03:53:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 82.35% +[2024-07-25 03:53:48 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 03:53:49 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 03:53:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.474 (0.474) Loss 0.4988 (0.4988) Acc@1 89.355 (89.355) Acc@5 98.633 (98.633) Mem 22339MB +[2024-07-25 03:53:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7861 (0.6288) Acc@1 81.250 (86.386) Acc@5 96.289 (97.741) Mem 22339MB +[2024-07-25 03:53:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9131 (0.7364) Acc@1 77.344 (83.166) Acc@5 95.312 (96.670) Mem 22339MB +[2024-07-25 03:53:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.835 Acc@5 96.673 +[2024-07-25 03:53:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.8% +[2024-07-25 03:53:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.84% +[2024-07-25 03:53:53 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 03:53:54 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 03:53:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][0/625] eta 0:10:06 lr 0.000764 wd 0.0500 time 0.9706 (0.9706) data time 0.4519 (0.4519) model time 0.0000 (0.0000) loss 7.7101 (7.7101) grad_norm 1.8468 (1.8468) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:54:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][10/625] eta 0:06:14 lr 0.000764 wd 0.0500 time 0.5723 (0.6093) data time 0.0007 (0.0418) model time 0.0000 (0.0000) loss 7.9917 (7.2982) grad_norm 1.7918 (1.8891) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:54:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][20/625] eta 0:05:58 lr 0.000764 wd 0.0500 time 0.5710 (0.5922) data time 0.0006 (0.0222) model time 0.0000 (0.0000) loss 6.3043 (7.1836) grad_norm 1.4764 (1.9457) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:54:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][30/625] eta 0:05:48 lr 0.000764 wd 0.0500 time 0.5750 (0.5863) data time 0.0007 (0.0153) model time 0.0000 (0.0000) loss 6.4247 (7.2811) grad_norm 3.4825 (2.0965) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:54:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][40/625] eta 0:05:41 lr 0.000764 wd 0.0500 time 0.5696 (0.5834) data time 0.0006 (0.0117) model time 0.0000 (0.0000) loss 8.6945 (7.3932) grad_norm 1.7033 (2.1712) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:54:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][50/625] eta 0:05:34 lr 0.000764 wd 0.0500 time 0.5690 (0.5815) data time 0.0007 (0.0096) model time 0.0000 (0.0000) loss 7.7682 (7.3994) grad_norm 1.9427 (2.2338) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:54:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][60/625] eta 0:05:27 lr 0.000763 wd 0.0500 time 0.5672 (0.5802) data time 0.0008 (0.0081) model time 0.5664 (0.5727) loss 9.4835 (7.4878) grad_norm 1.5770 (2.1654) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:54:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][70/625] eta 0:05:21 lr 0.000763 wd 0.0500 time 0.5720 (0.5794) data time 0.0008 (0.0071) model time 0.5712 (0.5732) loss 7.8771 (7.4797) grad_norm 2.4234 (2.1463) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:54:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][80/625] eta 0:05:15 lr 0.000763 wd 0.0500 time 0.5674 (0.5785) data time 0.0006 (0.0063) model time 0.5669 (0.5728) loss 6.9966 (7.4946) grad_norm 2.4111 (2.1486) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:54:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][90/625] eta 0:05:09 lr 0.000763 wd 0.0500 time 0.5717 (0.5786) data time 0.0008 (0.0057) model time 0.5710 (0.5742) loss 6.8597 (7.5059) grad_norm 2.0129 (2.1448) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:54:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][100/625] eta 0:05:03 lr 0.000763 wd 0.0500 time 0.5718 (0.5784) data time 0.0008 (0.0052) model time 0.5710 (0.5745) loss 6.3747 (7.5010) grad_norm 1.6315 (2.1261) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:54:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][110/625] eta 0:04:57 lr 0.000763 wd 0.0500 time 0.5692 (0.5781) data time 0.0007 (0.0048) model time 0.5685 (0.5745) loss 7.8959 (7.5262) grad_norm 2.0523 (2.1350) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:55:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][120/625] eta 0:04:51 lr 0.000763 wd 0.0500 time 0.5729 (0.5778) data time 0.0006 (0.0045) model time 0.5723 (0.5744) loss 7.6434 (7.5406) grad_norm 2.5207 (2.1370) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:55:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][130/625] eta 0:04:46 lr 0.000763 wd 0.0500 time 0.5743 (0.5779) data time 0.0006 (0.0042) model time 0.5737 (0.5748) loss 7.2807 (7.5220) grad_norm 3.0087 (2.1714) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:55:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][140/625] eta 0:04:40 lr 0.000763 wd 0.0500 time 0.5632 (0.5780) data time 0.0008 (0.0040) model time 0.5624 (0.5752) loss 7.3111 (7.5183) grad_norm 2.5469 (2.2159) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:55:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][150/625] eta 0:04:34 lr 0.000762 wd 0.0500 time 0.5692 (0.5784) data time 0.0006 (0.0038) model time 0.5686 (0.5761) loss 8.5159 (7.5051) grad_norm 1.8808 (2.2493) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:55:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][160/625] eta 0:04:29 lr 0.000762 wd 0.0500 time 0.5704 (0.5793) data time 0.0006 (0.0036) model time 0.5698 (0.5775) loss 6.7677 (7.4946) grad_norm 1.9655 (2.2574) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:55:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][170/625] eta 0:04:23 lr 0.000762 wd 0.0500 time 0.5733 (0.5800) data time 0.0007 (0.0034) model time 0.5725 (0.5786) loss 7.1435 (7.5066) grad_norm 1.8630 (2.2480) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:55:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][180/625] eta 0:04:19 lr 0.000762 wd 0.0500 time 0.7108 (0.5835) data time 0.0007 (0.0033) model time 0.7101 (0.5836) loss 6.1788 (7.4829) grad_norm 1.9130 (2.2267) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:55:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][190/625] eta 0:04:14 lr 0.000762 wd 0.0500 time 0.5724 (0.5862) data time 0.0007 (0.0031) model time 0.5716 (0.5871) loss 7.8601 (7.4833) grad_norm 1.8623 (2.2061) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:55:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][200/625] eta 0:04:10 lr 0.000762 wd 0.0500 time 0.5614 (0.5893) data time 0.0006 (0.0030) model time 0.5609 (0.5912) loss 5.9639 (7.4445) grad_norm 2.9065 (2.1999) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:55:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][210/625] eta 0:04:04 lr 0.000762 wd 0.0500 time 0.5714 (0.5894) data time 0.0007 (0.0029) model time 0.5707 (0.5911) loss 8.4232 (7.4610) grad_norm 2.8066 (2.2270) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:56:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][220/625] eta 0:03:58 lr 0.000762 wd 0.0500 time 0.5723 (0.5887) data time 0.0006 (0.0028) model time 0.5717 (0.5901) loss 7.7162 (7.4689) grad_norm 2.8204 (2.2227) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:56:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][230/625] eta 0:03:52 lr 0.000762 wd 0.0500 time 0.5727 (0.5882) data time 0.0008 (0.0027) model time 0.5718 (0.5893) loss 8.2223 (7.4754) grad_norm 2.3590 (2.2540) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:56:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][240/625] eta 0:03:46 lr 0.000762 wd 0.0500 time 0.5755 (0.5876) data time 0.0006 (0.0027) model time 0.5749 (0.5885) loss 6.2098 (7.4865) grad_norm 2.0986 (2.2647) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:56:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][250/625] eta 0:03:40 lr 0.000761 wd 0.0500 time 0.5739 (0.5872) data time 0.0008 (0.0026) model time 0.5731 (0.5878) loss 7.4044 (7.4922) grad_norm 4.4717 (2.3256) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:56:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][260/625] eta 0:03:34 lr 0.000761 wd 0.0500 time 0.5730 (0.5867) data time 0.0008 (0.0025) model time 0.5722 (0.5871) loss 6.8778 (7.5097) grad_norm 2.1498 (2.3293) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:56:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][270/625] eta 0:03:28 lr 0.000761 wd 0.0500 time 0.5693 (0.5863) data time 0.0006 (0.0024) model time 0.5687 (0.5866) loss 6.5574 (7.5118) grad_norm 1.8141 (2.3189) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:56:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][280/625] eta 0:03:22 lr 0.000761 wd 0.0500 time 0.5648 (0.5858) data time 0.0007 (0.0024) model time 0.5641 (0.5860) loss 7.4116 (7.5213) grad_norm 4.3671 (2.3353) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:56:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][290/625] eta 0:03:16 lr 0.000761 wd 0.0500 time 0.5680 (0.5854) data time 0.0008 (0.0023) model time 0.5672 (0.5855) loss 7.0323 (7.5198) grad_norm 2.4996 (2.3307) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:56:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][300/625] eta 0:03:10 lr 0.000761 wd 0.0500 time 0.5733 (0.5851) data time 0.0006 (0.0023) model time 0.5727 (0.5851) loss 7.6060 (7.5208) grad_norm 2.7240 (2.3204) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:56:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][310/625] eta 0:03:04 lr 0.000761 wd 0.0500 time 0.5744 (0.5849) data time 0.0009 (0.0022) model time 0.5735 (0.5847) loss 8.1302 (7.5227) grad_norm 2.9665 (2.3271) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:57:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][320/625] eta 0:02:58 lr 0.000761 wd 0.0500 time 0.5648 (0.5846) data time 0.0006 (0.0022) model time 0.5642 (0.5844) loss 6.2940 (7.5220) grad_norm 1.6753 (2.3272) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:57:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][330/625] eta 0:02:52 lr 0.000761 wd 0.0500 time 0.5727 (0.5843) data time 0.0008 (0.0022) model time 0.5719 (0.5840) loss 7.6036 (7.5245) grad_norm 1.7835 (2.3251) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:57:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][340/625] eta 0:02:46 lr 0.000761 wd 0.0500 time 0.5726 (0.5841) data time 0.0006 (0.0021) model time 0.5720 (0.5837) loss 6.7153 (7.5281) grad_norm 1.8671 (2.3286) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:57:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][350/625] eta 0:02:40 lr 0.000760 wd 0.0500 time 0.5721 (0.5840) data time 0.0008 (0.0021) model time 0.5712 (0.5836) loss 7.7943 (7.5334) grad_norm 2.1150 (2.3347) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:57:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][360/625] eta 0:02:34 lr 0.000760 wd 0.0500 time 0.5787 (0.5838) data time 0.0008 (0.0021) model time 0.5779 (0.5833) loss 8.1340 (7.5301) grad_norm 2.8002 (2.3355) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:57:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][370/625] eta 0:02:28 lr 0.000760 wd 0.0500 time 0.5691 (0.5836) data time 0.0008 (0.0020) model time 0.5682 (0.5831) loss 7.6243 (7.5250) grad_norm 2.4420 (2.3615) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:57:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][380/625] eta 0:02:22 lr 0.000760 wd 0.0500 time 0.5714 (0.5835) data time 0.0008 (0.0020) model time 0.5706 (0.5830) loss 7.6132 (7.5166) grad_norm 1.7757 (2.3561) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:57:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][390/625] eta 0:02:17 lr 0.000760 wd 0.0500 time 0.6948 (0.5837) data time 0.0006 (0.0020) model time 0.6942 (0.5833) loss 7.9041 (7.5291) grad_norm 1.6888 (2.3494) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:57:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][400/625] eta 0:02:11 lr 0.000760 wd 0.0500 time 0.5762 (0.5849) data time 0.0008 (0.0019) model time 0.5755 (0.5846) loss 7.9777 (7.5240) grad_norm 2.8595 (2.3577) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:57:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][410/625] eta 0:02:05 lr 0.000760 wd 0.0500 time 0.5719 (0.5859) data time 0.0007 (0.0019) model time 0.5712 (0.5857) loss 8.4185 (7.5370) grad_norm 1.9981 (2.3579) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:58:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][420/625] eta 0:02:00 lr 0.000760 wd 0.0500 time 0.5712 (0.5875) data time 0.0006 (0.0019) model time 0.5707 (0.5875) loss 8.4929 (7.5238) grad_norm 2.2717 (2.3544) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:58:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][430/625] eta 0:01:54 lr 0.000760 wd 0.0500 time 0.5755 (0.5879) data time 0.0008 (0.0018) model time 0.5747 (0.5880) loss 7.1456 (7.5176) grad_norm 2.7568 (2.3558) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:58:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][440/625] eta 0:01:48 lr 0.000759 wd 0.0500 time 0.5766 (0.5876) data time 0.0006 (0.0018) model time 0.5760 (0.5876) loss 5.6778 (7.5123) grad_norm 1.6007 (2.3446) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:58:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][450/625] eta 0:01:42 lr 0.000759 wd 0.0500 time 0.5742 (0.5874) data time 0.0006 (0.0018) model time 0.5736 (0.5873) loss 6.0944 (7.5084) grad_norm 2.4369 (2.3422) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:58:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][460/625] eta 0:01:36 lr 0.000759 wd 0.0500 time 0.5724 (0.5871) data time 0.0007 (0.0018) model time 0.5717 (0.5870) loss 7.1379 (7.5091) grad_norm 2.0051 (2.3367) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:58:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][470/625] eta 0:01:30 lr 0.000759 wd 0.0500 time 0.5645 (0.5869) data time 0.0008 (0.0018) model time 0.5637 (0.5867) loss 6.3863 (7.5087) grad_norm 3.2898 (2.3373) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:58:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][480/625] eta 0:01:25 lr 0.000759 wd 0.0500 time 0.5710 (0.5867) data time 0.0008 (0.0017) model time 0.5702 (0.5865) loss 5.8311 (7.5070) grad_norm 2.0995 (2.3361) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:58:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][490/625] eta 0:01:19 lr 0.000759 wd 0.0500 time 0.5742 (0.5865) data time 0.0006 (0.0017) model time 0.5736 (0.5862) loss 6.5695 (7.5121) grad_norm 2.1802 (2.3332) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 03:58:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][500/625] eta 0:01:13 lr 0.000759 wd 0.0500 time 0.5615 (0.5863) data time 0.0006 (0.0017) model time 0.5609 (0.5860) loss 8.7605 (7.5135) grad_norm 1.7039 (2.3270) loss_scale 4096.0000 (2052.0878) mem 22339MB +[2024-07-25 03:58:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][510/625] eta 0:01:07 lr 0.000759 wd 0.0500 time 0.5750 (0.5861) data time 0.0009 (0.0017) model time 0.5741 (0.5858) loss 6.3582 (7.5163) grad_norm 1.6890 (2.3226) loss_scale 4096.0000 (2092.0861) mem 22339MB +[2024-07-25 03:58:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][520/625] eta 0:01:01 lr 0.000759 wd 0.0500 time 0.5732 (0.5859) data time 0.0008 (0.0017) model time 0.5724 (0.5855) loss 9.2080 (7.5155) grad_norm 3.1082 (2.3271) loss_scale 4096.0000 (2130.5489) mem 22339MB +[2024-07-25 03:59:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][530/625] eta 0:00:55 lr 0.000759 wd 0.0500 time 0.5627 (0.5857) data time 0.0006 (0.0016) model time 0.5621 (0.5853) loss 7.1554 (7.5184) grad_norm 1.9885 (2.3264) loss_scale 4096.0000 (2167.5631) mem 22339MB +[2024-07-25 03:59:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][540/625] eta 0:00:49 lr 0.000758 wd 0.0500 time 0.5722 (0.5855) data time 0.0008 (0.0016) model time 0.5713 (0.5851) loss 7.9626 (7.5232) grad_norm 2.0039 (2.3221) loss_scale 4096.0000 (2203.2089) mem 22339MB +[2024-07-25 03:59:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][550/625] eta 0:00:43 lr 0.000758 wd 0.0500 time 0.5677 (0.5853) data time 0.0008 (0.0016) model time 0.5669 (0.5849) loss 9.2235 (7.5266) grad_norm 4.4219 (2.3291) loss_scale 4096.0000 (2237.5608) mem 22339MB +[2024-07-25 03:59:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][560/625] eta 0:00:38 lr 0.000758 wd 0.0500 time 0.5725 (0.5851) data time 0.0008 (0.0016) model time 0.5718 (0.5847) loss 7.4081 (7.5252) grad_norm 1.6052 (2.3297) loss_scale 4096.0000 (2270.6881) mem 22339MB +[2024-07-25 03:59:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][570/625] eta 0:00:32 lr 0.000758 wd 0.0500 time 0.5726 (0.5852) data time 0.0008 (0.0016) model time 0.5718 (0.5848) loss 9.0471 (7.5299) grad_norm 1.8995 (2.3224) loss_scale 4096.0000 (2302.6550) mem 22339MB +[2024-07-25 03:59:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][580/625] eta 0:00:26 lr 0.000758 wd 0.0500 time 0.5711 (0.5851) data time 0.0008 (0.0016) model time 0.5703 (0.5846) loss 6.8549 (7.5246) grad_norm 1.8997 (2.3224) loss_scale 4096.0000 (2333.5215) mem 22339MB +[2024-07-25 03:59:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][590/625] eta 0:00:20 lr 0.000758 wd 0.0500 time 0.5729 (0.5849) data time 0.0006 (0.0016) model time 0.5723 (0.5844) loss 6.8986 (7.5305) grad_norm 3.2628 (2.3242) loss_scale 4096.0000 (2363.3435) mem 22339MB +[2024-07-25 03:59:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][600/625] eta 0:00:14 lr 0.000758 wd 0.0500 time 0.5728 (0.5849) data time 0.0006 (0.0015) model time 0.5722 (0.5844) loss 8.7495 (7.5320) grad_norm 1.9464 (2.3195) loss_scale 4096.0000 (2392.1730) mem 22339MB +[2024-07-25 03:59:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][610/625] eta 0:00:08 lr 0.000758 wd 0.0500 time 0.7492 (0.5852) data time 0.0005 (0.0015) model time 0.7486 (0.5848) loss 6.4015 (7.5268) grad_norm 1.7648 (2.3147) loss_scale 4096.0000 (2420.0589) mem 22339MB +[2024-07-25 03:59:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [136/300][620/625] eta 0:00:02 lr 0.000758 wd 0.0500 time 0.6661 (0.5859) data time 0.0005 (0.0015) model time 0.6656 (0.5855) loss 7.6552 (7.5242) grad_norm 3.7216 (2.3204) loss_scale 4096.0000 (2447.0467) mem 22339MB +[2024-07-25 04:00:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 136 training takes 0:06:06 +[2024-07-25 04:00:00 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 04:00:02 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 04:00:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.475 (0.475) Loss 0.4980 (0.4980) Acc@1 89.844 (89.844) Acc@5 98.779 (98.779) Mem 22339MB +[2024-07-25 04:00:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.161) Loss 0.8281 (0.6463) Acc@1 79.346 (85.795) Acc@5 96.045 (97.665) Mem 22339MB +[2024-07-25 04:00:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.144) Loss 0.9272 (0.7647) Acc@1 76.758 (82.536) Acc@5 94.678 (96.405) Mem 22339MB +[2024-07-25 04:00:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.220 Acc@5 96.371 +[2024-07-25 04:00:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.2% +[2024-07-25 04:00:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.798 (0.798) Loss 0.4980 (0.4980) Acc@1 89.404 (89.404) Acc@5 98.633 (98.633) Mem 22339MB +[2024-07-25 04:00:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.188) Loss 0.7852 (0.6283) Acc@1 81.396 (86.417) Acc@5 96.240 (97.736) Mem 22339MB +[2024-07-25 04:00:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.158) Loss 0.9136 (0.7356) Acc@1 77.344 (83.201) Acc@5 95.410 (96.682) Mem 22339MB +[2024-07-25 04:00:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.867 Acc@5 96.683 +[2024-07-25 04:00:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.9% +[2024-07-25 04:00:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.87% +[2024-07-25 04:00:09 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 04:00:11 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 04:00:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][0/625] eta 0:13:05 lr 0.000758 wd 0.0500 time 1.2565 (1.2565) data time 0.7374 (0.7374) model time 0.0000 (0.0000) loss 8.7033 (8.7033) grad_norm 3.0208 (3.0208) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:00:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][10/625] eta 0:07:29 lr 0.000757 wd 0.0500 time 0.7284 (0.7305) data time 0.0006 (0.0677) model time 0.0000 (0.0000) loss 5.9864 (7.0068) grad_norm 2.0039 (2.6792) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:00:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][20/625] eta 0:06:43 lr 0.000757 wd 0.0500 time 0.6952 (0.6675) data time 0.0007 (0.0358) model time 0.0000 (0.0000) loss 8.9322 (7.5116) grad_norm 2.5143 (2.5757) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:00:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][30/625] eta 0:06:18 lr 0.000757 wd 0.0500 time 0.5691 (0.6369) data time 0.0008 (0.0245) model time 0.0000 (0.0000) loss 6.1056 (7.4874) grad_norm 2.2258 (2.5099) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:00:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][40/625] eta 0:06:03 lr 0.000757 wd 0.0500 time 0.5715 (0.6215) data time 0.0008 (0.0187) model time 0.0000 (0.0000) loss 5.8604 (7.4993) grad_norm 1.8433 (2.5077) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:00:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][50/625] eta 0:05:51 lr 0.000757 wd 0.0500 time 0.5743 (0.6120) data time 0.0008 (0.0152) model time 0.0000 (0.0000) loss 7.5874 (7.4731) grad_norm 2.2436 (2.4821) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:00:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][60/625] eta 0:05:42 lr 0.000757 wd 0.0500 time 0.5760 (0.6059) data time 0.0008 (0.0129) model time 0.5752 (0.5735) loss 7.4267 (7.4942) grad_norm 4.7200 (2.6467) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:00:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][70/625] eta 0:05:33 lr 0.000757 wd 0.0500 time 0.5755 (0.6015) data time 0.0008 (0.0112) model time 0.5748 (0.5738) loss 8.5127 (7.4831) grad_norm 2.5860 (2.6310) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:00:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][80/625] eta 0:05:26 lr 0.000757 wd 0.0500 time 0.5710 (0.5982) data time 0.0008 (0.0099) model time 0.5702 (0.5738) loss 8.6532 (7.4806) grad_norm 1.7470 (2.5263) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:01:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][90/625] eta 0:05:18 lr 0.000757 wd 0.0500 time 0.5779 (0.5957) data time 0.0008 (0.0089) model time 0.5771 (0.5741) loss 6.6783 (7.4831) grad_norm 1.6438 (2.4675) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:01:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][100/625] eta 0:05:12 lr 0.000757 wd 0.0500 time 0.5779 (0.5951) data time 0.0008 (0.0082) model time 0.5771 (0.5769) loss 7.5153 (7.4829) grad_norm 2.1813 (2.4360) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:01:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][110/625] eta 0:05:05 lr 0.000756 wd 0.0500 time 0.5772 (0.5933) data time 0.0008 (0.0075) model time 0.5765 (0.5764) loss 8.9498 (7.5023) grad_norm 1.7483 (2.3959) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:01:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][120/625] eta 0:04:58 lr 0.000756 wd 0.0500 time 0.5991 (0.5919) data time 0.0006 (0.0069) model time 0.5985 (0.5764) loss 7.8466 (7.5229) grad_norm 2.5357 (2.3857) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:01:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][130/625] eta 0:04:52 lr 0.000756 wd 0.0500 time 0.5867 (0.5907) data time 0.0008 (0.0065) model time 0.5858 (0.5762) loss 8.0178 (7.5541) grad_norm 1.8224 (2.3503) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:01:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][140/625] eta 0:04:45 lr 0.000756 wd 0.0500 time 0.5857 (0.5896) data time 0.0006 (0.0061) model time 0.5851 (0.5760) loss 5.7478 (7.5454) grad_norm 1.7876 (2.3369) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:01:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][150/625] eta 0:04:39 lr 0.000756 wd 0.0500 time 0.5697 (0.5887) data time 0.0006 (0.0057) model time 0.5691 (0.5759) loss 7.3948 (7.5366) grad_norm 1.6591 (2.3127) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:01:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][160/625] eta 0:04:33 lr 0.000756 wd 0.0500 time 0.5761 (0.5878) data time 0.0008 (0.0054) model time 0.5753 (0.5757) loss 6.8197 (7.5206) grad_norm 2.9286 (2.3102) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:01:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][170/625] eta 0:04:27 lr 0.000756 wd 0.0500 time 0.5745 (0.5870) data time 0.0008 (0.0051) model time 0.5737 (0.5754) loss 6.9563 (7.5408) grad_norm 1.4788 (2.3007) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:01:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][180/625] eta 0:04:20 lr 0.000756 wd 0.0500 time 0.5764 (0.5864) data time 0.0008 (0.0049) model time 0.5756 (0.5755) loss 7.3130 (7.5241) grad_norm 2.3078 (2.2861) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:02:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][190/625] eta 0:04:14 lr 0.000756 wd 0.0500 time 0.5785 (0.5859) data time 0.0006 (0.0047) model time 0.5778 (0.5755) loss 7.6071 (7.5347) grad_norm 2.1264 (2.2842) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:02:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][200/625] eta 0:04:08 lr 0.000756 wd 0.0500 time 0.5768 (0.5857) data time 0.0006 (0.0045) model time 0.5762 (0.5759) loss 7.7501 (7.5515) grad_norm 1.6893 (2.2620) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:02:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][210/625] eta 0:04:04 lr 0.000755 wd 0.0500 time 0.7239 (0.5880) data time 0.0006 (0.0043) model time 0.7233 (0.5795) loss 7.1825 (7.5389) grad_norm 2.0567 (2.2474) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:02:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][220/625] eta 0:03:58 lr 0.000755 wd 0.0500 time 0.6356 (0.5893) data time 0.0008 (0.0042) model time 0.6349 (0.5816) loss 7.0344 (7.5359) grad_norm 2.1499 (2.2365) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:02:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][230/625] eta 0:03:54 lr 0.000755 wd 0.0500 time 0.7678 (0.5924) data time 0.0006 (0.0040) model time 0.7671 (0.5860) loss 8.7821 (7.5341) grad_norm 1.8837 (2.2532) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:02:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][240/625] eta 0:03:48 lr 0.000755 wd 0.0500 time 0.5804 (0.5934) data time 0.0008 (0.0039) model time 0.5796 (0.5875) loss 9.4087 (7.5368) grad_norm 1.6300 (2.2389) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:02:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][250/625] eta 0:03:42 lr 0.000755 wd 0.0500 time 0.5820 (0.5927) data time 0.0007 (0.0038) model time 0.5812 (0.5869) loss 6.6824 (7.5163) grad_norm 1.9525 (2.2352) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:02:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][260/625] eta 0:03:36 lr 0.000755 wd 0.0500 time 0.5759 (0.5922) data time 0.0006 (0.0037) model time 0.5753 (0.5864) loss 8.7881 (7.5214) grad_norm 1.9158 (2.2390) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:02:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][270/625] eta 0:03:30 lr 0.000755 wd 0.0500 time 0.5825 (0.5916) data time 0.0008 (0.0036) model time 0.5817 (0.5859) loss 6.8723 (7.5255) grad_norm 1.7541 (2.2277) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:02:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][280/625] eta 0:03:23 lr 0.000755 wd 0.0500 time 0.5787 (0.5910) data time 0.0009 (0.0035) model time 0.5779 (0.5854) loss 8.1499 (7.5356) grad_norm 2.1992 (2.2355) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:03:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][290/625] eta 0:03:17 lr 0.000755 wd 0.0500 time 0.5815 (0.5905) data time 0.0006 (0.0034) model time 0.5809 (0.5850) loss 7.5169 (7.5476) grad_norm 1.9289 (2.2378) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:03:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][300/625] eta 0:03:11 lr 0.000754 wd 0.0500 time 0.5848 (0.5901) data time 0.0006 (0.0033) model time 0.5841 (0.5847) loss 6.4118 (7.5501) grad_norm 1.6481 (2.2397) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:03:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][310/625] eta 0:03:05 lr 0.000754 wd 0.0500 time 0.5758 (0.5896) data time 0.0008 (0.0032) model time 0.5750 (0.5843) loss 7.4573 (7.5627) grad_norm 1.4919 (2.2315) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:03:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][320/625] eta 0:02:59 lr 0.000754 wd 0.0500 time 0.6258 (0.5898) data time 0.0006 (0.0032) model time 0.6252 (0.5848) loss 6.7042 (7.5596) grad_norm 1.9991 (2.2317) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:03:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][330/625] eta 0:02:53 lr 0.000754 wd 0.0500 time 0.5846 (0.5895) data time 0.0007 (0.0031) model time 0.5839 (0.5845) loss 6.2212 (7.5557) grad_norm 2.0816 (2.2301) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:03:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][340/625] eta 0:02:47 lr 0.000754 wd 0.0500 time 0.5745 (0.5892) data time 0.0006 (0.0031) model time 0.5738 (0.5842) loss 7.9374 (7.5605) grad_norm 1.9435 (2.2307) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:03:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][350/625] eta 0:02:41 lr 0.000754 wd 0.0500 time 0.5755 (0.5888) data time 0.0006 (0.0030) model time 0.5750 (0.5839) loss 6.4764 (7.5507) grad_norm 2.0307 (2.2347) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:03:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][360/625] eta 0:02:35 lr 0.000754 wd 0.0500 time 0.5758 (0.5885) data time 0.0008 (0.0029) model time 0.5750 (0.5837) loss 9.3978 (7.5515) grad_norm 2.4761 (inf) loss_scale 2048.0000 (4056.2881) mem 22339MB +[2024-07-25 04:03:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][370/625] eta 0:02:29 lr 0.000754 wd 0.0500 time 0.5759 (0.5881) data time 0.0008 (0.0029) model time 0.5751 (0.5834) loss 6.8434 (7.5467) grad_norm 2.0903 (inf) loss_scale 2048.0000 (4002.1563) mem 22339MB +[2024-07-25 04:03:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][380/625] eta 0:02:24 lr 0.000754 wd 0.0500 time 0.5759 (0.5878) data time 0.0008 (0.0029) model time 0.5751 (0.5831) loss 6.9608 (7.5403) grad_norm 2.6668 (inf) loss_scale 2048.0000 (3950.8661) mem 22339MB +[2024-07-25 04:04:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][390/625] eta 0:02:18 lr 0.000754 wd 0.0500 time 0.5755 (0.5875) data time 0.0006 (0.0028) model time 0.5749 (0.5829) loss 7.2768 (7.5366) grad_norm 1.9059 (inf) loss_scale 2048.0000 (3902.1995) mem 22339MB +[2024-07-25 04:04:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][400/625] eta 0:02:12 lr 0.000753 wd 0.0500 time 0.5861 (0.5872) data time 0.0007 (0.0028) model time 0.5854 (0.5827) loss 8.9725 (7.5392) grad_norm 2.7271 (inf) loss_scale 2048.0000 (3855.9601) mem 22339MB +[2024-07-25 04:04:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][410/625] eta 0:02:06 lr 0.000753 wd 0.0500 time 0.6079 (0.5870) data time 0.0008 (0.0027) model time 0.6071 (0.5825) loss 8.2643 (7.5374) grad_norm 2.5898 (inf) loss_scale 2048.0000 (3811.9708) mem 22339MB +[2024-07-25 04:04:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][420/625] eta 0:02:00 lr 0.000753 wd 0.0500 time 0.5756 (0.5870) data time 0.0007 (0.0027) model time 0.5749 (0.5826) loss 7.2836 (7.5351) grad_norm 1.6134 (inf) loss_scale 2048.0000 (3770.0713) mem 22339MB +[2024-07-25 04:04:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][430/625] eta 0:01:54 lr 0.000753 wd 0.0500 time 0.7296 (0.5886) data time 0.0006 (0.0026) model time 0.7290 (0.5845) loss 8.2491 (7.5483) grad_norm 2.4273 (inf) loss_scale 2048.0000 (3730.1160) mem 22339MB +[2024-07-25 04:04:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][440/625] eta 0:01:48 lr 0.000753 wd 0.0500 time 0.7027 (0.5890) data time 0.0007 (0.0026) model time 0.7020 (0.5851) loss 6.3335 (7.5442) grad_norm 2.9166 (inf) loss_scale 2048.0000 (3691.9728) mem 22339MB +[2024-07-25 04:04:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][450/625] eta 0:01:43 lr 0.000753 wd 0.0500 time 0.7483 (0.5908) data time 0.0008 (0.0025) model time 0.7475 (0.5871) loss 8.1100 (7.5498) grad_norm 2.2471 (inf) loss_scale 2048.0000 (3655.5211) mem 22339MB +[2024-07-25 04:04:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][460/625] eta 0:01:37 lr 0.000753 wd 0.0500 time 0.5748 (0.5912) data time 0.0006 (0.0025) model time 0.5742 (0.5877) loss 8.6122 (7.5535) grad_norm 1.7344 (inf) loss_scale 2048.0000 (3620.6508) mem 22339MB +[2024-07-25 04:04:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][470/625] eta 0:01:31 lr 0.000753 wd 0.0500 time 0.5832 (0.5909) data time 0.0008 (0.0025) model time 0.5824 (0.5874) loss 7.5941 (7.5556) grad_norm 2.0444 (inf) loss_scale 2048.0000 (3587.2611) mem 22339MB +[2024-07-25 04:04:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][480/625] eta 0:01:25 lr 0.000753 wd 0.0500 time 0.5741 (0.5905) data time 0.0008 (0.0024) model time 0.5733 (0.5871) loss 7.5035 (7.5526) grad_norm 4.0733 (inf) loss_scale 2048.0000 (3555.2599) mem 22339MB +[2024-07-25 04:05:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][490/625] eta 0:01:19 lr 0.000753 wd 0.0500 time 0.5767 (0.5903) data time 0.0008 (0.0024) model time 0.5760 (0.5868) loss 7.8956 (7.5500) grad_norm 2.4361 (inf) loss_scale 2048.0000 (3524.5621) mem 22339MB +[2024-07-25 04:05:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][500/625] eta 0:01:13 lr 0.000752 wd 0.0500 time 0.5793 (0.5900) data time 0.0009 (0.0024) model time 0.5784 (0.5866) loss 6.0607 (7.5543) grad_norm 1.8900 (inf) loss_scale 2048.0000 (3495.0898) mem 22339MB +[2024-07-25 04:05:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][510/625] eta 0:01:07 lr 0.000752 wd 0.0500 time 0.5782 (0.5897) data time 0.0008 (0.0023) model time 0.5774 (0.5863) loss 8.8559 (7.5488) grad_norm 2.4037 (inf) loss_scale 2048.0000 (3466.7710) mem 22339MB +[2024-07-25 04:05:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][520/625] eta 0:01:01 lr 0.000752 wd 0.0500 time 0.5793 (0.5894) data time 0.0009 (0.0023) model time 0.5783 (0.5861) loss 7.3847 (7.5408) grad_norm 1.8488 (inf) loss_scale 2048.0000 (3439.5393) mem 22339MB +[2024-07-25 04:05:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][530/625] eta 0:00:55 lr 0.000752 wd 0.0500 time 0.5812 (0.5892) data time 0.0008 (0.0023) model time 0.5804 (0.5858) loss 6.8336 (7.5491) grad_norm 1.4415 (inf) loss_scale 2048.0000 (3413.3333) mem 22339MB +[2024-07-25 04:05:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][540/625] eta 0:00:50 lr 0.000752 wd 0.0500 time 0.5729 (0.5892) data time 0.0007 (0.0023) model time 0.5722 (0.5860) loss 5.5932 (7.5480) grad_norm 2.1196 (inf) loss_scale 2048.0000 (3388.0961) mem 22339MB +[2024-07-25 04:05:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][550/625] eta 0:00:44 lr 0.000752 wd 0.0500 time 0.5778 (0.5890) data time 0.0006 (0.0022) model time 0.5772 (0.5857) loss 8.0589 (7.5499) grad_norm 1.9058 (inf) loss_scale 2048.0000 (3363.7750) mem 22339MB +[2024-07-25 04:05:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][560/625] eta 0:00:38 lr 0.000752 wd 0.0500 time 0.5779 (0.5887) data time 0.0007 (0.0022) model time 0.5773 (0.5855) loss 6.5798 (7.5415) grad_norm 1.5266 (inf) loss_scale 2048.0000 (3340.3209) mem 22339MB +[2024-07-25 04:05:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][570/625] eta 0:00:32 lr 0.000752 wd 0.0500 time 0.5810 (0.5885) data time 0.0006 (0.0022) model time 0.5804 (0.5853) loss 6.9667 (7.5441) grad_norm 2.5313 (inf) loss_scale 2048.0000 (3317.6883) mem 22339MB +[2024-07-25 04:05:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][580/625] eta 0:00:26 lr 0.000752 wd 0.0500 time 0.5817 (0.5883) data time 0.0008 (0.0022) model time 0.5809 (0.5851) loss 8.1932 (7.5495) grad_norm 3.2384 (inf) loss_scale 2048.0000 (3295.8348) mem 22339MB +[2024-07-25 04:05:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][590/625] eta 0:00:20 lr 0.000752 wd 0.0500 time 0.5775 (0.5881) data time 0.0008 (0.0021) model time 0.5767 (0.5849) loss 7.5157 (7.5525) grad_norm 2.0388 (inf) loss_scale 2048.0000 (3274.7208) mem 22339MB +[2024-07-25 04:06:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][600/625] eta 0:00:14 lr 0.000751 wd 0.0500 time 0.5806 (0.5879) data time 0.0006 (0.0021) model time 0.5800 (0.5847) loss 7.0164 (7.5547) grad_norm 3.1346 (inf) loss_scale 2048.0000 (3254.3095) mem 22339MB +[2024-07-25 04:06:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][610/625] eta 0:00:08 lr 0.000751 wd 0.0500 time 0.5750 (0.5876) data time 0.0006 (0.0021) model time 0.5745 (0.5845) loss 6.6692 (7.5517) grad_norm 2.1062 (inf) loss_scale 2048.0000 (3234.5663) mem 22339MB +[2024-07-25 04:06:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [137/300][620/625] eta 0:00:02 lr 0.000751 wd 0.0500 time 0.5737 (0.5874) data time 0.0005 (0.0021) model time 0.5731 (0.5843) loss 7.7798 (7.5605) grad_norm 1.7017 (inf) loss_scale 2048.0000 (3215.4589) mem 22339MB +[2024-07-25 04:06:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 137 training takes 0:06:07 +[2024-07-25 04:06:18 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 04:06:19 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 04:06:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.479 (0.479) Loss 0.5059 (0.5059) Acc@1 89.648 (89.648) Acc@5 98.730 (98.730) Mem 22339MB +[2024-07-25 04:06:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.8545 (0.6619) Acc@1 80.371 (85.862) Acc@5 95.996 (97.630) Mem 22339MB +[2024-07-25 04:06:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9224 (0.7664) Acc@1 78.174 (82.931) Acc@5 94.873 (96.498) Mem 22339MB +[2024-07-25 04:06:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.548 Acc@5 96.477 +[2024-07-25 04:06:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.5% +[2024-07-25 04:06:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 82.55% +[2024-07-25 04:06:23 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 04:06:24 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 04:06:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.473 (0.473) Loss 0.4976 (0.4976) Acc@1 89.502 (89.502) Acc@5 98.682 (98.682) Mem 22339MB +[2024-07-25 04:06:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7837 (0.6277) Acc@1 81.348 (86.430) Acc@5 96.240 (97.741) Mem 22339MB +[2024-07-25 04:06:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9131 (0.7350) Acc@1 77.441 (83.233) Acc@5 95.459 (96.698) Mem 22339MB +[2024-07-25 04:06:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.889 Acc@5 96.697 +[2024-07-25 04:06:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.9% +[2024-07-25 04:06:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.89% +[2024-07-25 04:06:28 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 04:06:29 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 04:06:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][0/625] eta 0:08:54 lr 0.000751 wd 0.0500 time 0.8547 (0.8547) data time 0.3360 (0.3360) model time 0.0000 (0.0000) loss 7.9021 (7.9021) grad_norm 1.8868 (1.8868) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:06:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][10/625] eta 0:06:10 lr 0.000751 wd 0.0500 time 0.5702 (0.6022) data time 0.0006 (0.0313) model time 0.0000 (0.0000) loss 8.0410 (7.4537) grad_norm 2.0700 (2.0171) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:06:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][20/625] eta 0:06:05 lr 0.000751 wd 0.0500 time 0.5718 (0.6041) data time 0.0006 (0.0168) model time 0.0000 (0.0000) loss 7.9352 (7.4721) grad_norm 2.2850 (2.0816) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:06:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][30/625] eta 0:06:03 lr 0.000751 wd 0.0500 time 0.5689 (0.6105) data time 0.0006 (0.0117) model time 0.0000 (0.0000) loss 5.9385 (7.3425) grad_norm 2.4943 (2.0307) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:06:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][40/625] eta 0:05:58 lr 0.000751 wd 0.0500 time 0.7440 (0.6132) data time 0.0006 (0.0090) model time 0.0000 (0.0000) loss 6.3342 (7.2922) grad_norm 2.6219 (2.0675) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:07:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][50/625] eta 0:05:55 lr 0.000751 wd 0.0500 time 0.6969 (0.6186) data time 0.0006 (0.0074) model time 0.0000 (0.0000) loss 7.1376 (7.3649) grad_norm 1.4102 (2.0826) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:07:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][60/625] eta 0:05:47 lr 0.000751 wd 0.0500 time 0.5624 (0.6152) data time 0.0008 (0.0063) model time 0.5615 (0.5974) loss 7.3165 (7.3779) grad_norm 2.4283 (2.1652) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:07:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][70/625] eta 0:05:38 lr 0.000750 wd 0.0500 time 0.5720 (0.6097) data time 0.0006 (0.0058) model time 0.5713 (0.5856) loss 6.7810 (7.3888) grad_norm 1.8141 (2.1964) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:07:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][80/625] eta 0:05:30 lr 0.000750 wd 0.0500 time 0.5725 (0.6057) data time 0.0008 (0.0051) model time 0.5717 (0.5825) loss 6.1998 (7.3657) grad_norm 1.5951 (2.1742) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:07:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][90/625] eta 0:05:22 lr 0.000750 wd 0.0500 time 0.5684 (0.6029) data time 0.0010 (0.0047) model time 0.5674 (0.5817) loss 6.6268 (7.3744) grad_norm 2.3870 (2.1881) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:07:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][100/625] eta 0:05:15 lr 0.000750 wd 0.0500 time 0.5741 (0.6011) data time 0.0008 (0.0043) model time 0.5732 (0.5821) loss 8.3729 (7.4014) grad_norm 2.3582 (2.2206) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:07:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][110/625] eta 0:05:08 lr 0.000750 wd 0.0500 time 0.5626 (0.5991) data time 0.0007 (0.0040) model time 0.5619 (0.5816) loss 8.8765 (7.4335) grad_norm 1.8572 (2.2323) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:07:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][120/625] eta 0:05:01 lr 0.000750 wd 0.0500 time 0.5727 (0.5972) data time 0.0010 (0.0037) model time 0.5718 (0.5807) loss 6.0003 (7.4377) grad_norm 1.5431 (2.2199) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:07:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][130/625] eta 0:04:54 lr 0.000750 wd 0.0500 time 0.5729 (0.5955) data time 0.0006 (0.0035) model time 0.5722 (0.5799) loss 7.7385 (7.4689) grad_norm 1.7859 (2.2223) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:07:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][140/625] eta 0:04:48 lr 0.000750 wd 0.0500 time 0.5709 (0.5942) data time 0.0007 (0.0033) model time 0.5702 (0.5794) loss 8.0600 (7.4836) grad_norm 2.4695 (2.2208) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:07:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][150/625] eta 0:04:41 lr 0.000750 wd 0.0500 time 0.5729 (0.5930) data time 0.0006 (0.0031) model time 0.5723 (0.5790) loss 7.9767 (7.4979) grad_norm 2.6086 (2.2224) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:08:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][160/625] eta 0:04:35 lr 0.000749 wd 0.0500 time 0.5770 (0.5919) data time 0.0008 (0.0030) model time 0.5762 (0.5786) loss 6.8127 (7.4880) grad_norm 1.5899 (2.2073) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:08:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][170/625] eta 0:04:28 lr 0.000749 wd 0.0500 time 0.5738 (0.5911) data time 0.0009 (0.0029) model time 0.5729 (0.5785) loss 8.3096 (7.5105) grad_norm 2.3846 (2.2130) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:08:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][180/625] eta 0:04:22 lr 0.000749 wd 0.0500 time 0.5743 (0.5902) data time 0.0008 (0.0028) model time 0.5736 (0.5782) loss 7.2501 (7.5042) grad_norm 1.9695 (2.2218) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:08:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][190/625] eta 0:04:16 lr 0.000749 wd 0.0500 time 0.5651 (0.5894) data time 0.0008 (0.0027) model time 0.5643 (0.5778) loss 8.6221 (7.5227) grad_norm 2.8058 (2.2282) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:08:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][200/625] eta 0:04:10 lr 0.000749 wd 0.0500 time 0.5707 (0.5887) data time 0.0006 (0.0026) model time 0.5700 (0.5776) loss 8.1937 (7.5161) grad_norm 3.3193 (2.2432) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:08:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][210/625] eta 0:04:04 lr 0.000749 wd 0.0500 time 0.5730 (0.5881) data time 0.0006 (0.0025) model time 0.5725 (0.5775) loss 7.0071 (7.5294) grad_norm 2.7548 (2.2632) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:08:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][220/625] eta 0:03:57 lr 0.000749 wd 0.0500 time 0.5760 (0.5876) data time 0.0006 (0.0024) model time 0.5754 (0.5775) loss 6.8807 (7.5282) grad_norm 2.4701 (2.2631) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:08:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][230/625] eta 0:03:51 lr 0.000749 wd 0.0500 time 0.5725 (0.5871) data time 0.0009 (0.0023) model time 0.5717 (0.5773) loss 8.3343 (7.5046) grad_norm 2.5979 (2.2520) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:08:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][240/625] eta 0:03:46 lr 0.000749 wd 0.0500 time 0.7299 (0.5880) data time 0.0008 (0.0023) model time 0.7290 (0.5790) loss 5.7899 (7.5196) grad_norm 2.3539 (2.2744) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:08:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][250/625] eta 0:03:41 lr 0.000749 wd 0.0500 time 0.5737 (0.5896) data time 0.0007 (0.0022) model time 0.5730 (0.5814) loss 8.2758 (7.5184) grad_norm 2.0918 (2.2774) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:09:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][260/625] eta 0:03:35 lr 0.000748 wd 0.0500 time 0.6838 (0.5915) data time 0.0006 (0.0022) model time 0.6831 (0.5841) loss 7.9276 (7.5417) grad_norm 2.5135 (2.2783) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:09:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][270/625] eta 0:03:30 lr 0.000748 wd 0.0500 time 0.5698 (0.5936) data time 0.0008 (0.0021) model time 0.5690 (0.5870) loss 8.4488 (7.5450) grad_norm 1.9879 (2.2660) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:09:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][280/625] eta 0:03:24 lr 0.000748 wd 0.0500 time 0.5651 (0.5941) data time 0.0006 (0.0021) model time 0.5645 (0.5878) loss 6.7531 (7.5405) grad_norm 1.5483 (2.2590) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:09:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][290/625] eta 0:03:18 lr 0.000748 wd 0.0500 time 0.5711 (0.5936) data time 0.0008 (0.0020) model time 0.5703 (0.5874) loss 7.2114 (7.5227) grad_norm 1.9468 (2.2465) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:09:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][300/625] eta 0:03:12 lr 0.000748 wd 0.0500 time 0.5750 (0.5930) data time 0.0009 (0.0020) model time 0.5741 (0.5869) loss 8.1064 (7.5203) grad_norm 1.8077 (2.2442) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:09:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][310/625] eta 0:03:06 lr 0.000748 wd 0.0500 time 0.5687 (0.5925) data time 0.0006 (0.0020) model time 0.5681 (0.5865) loss 7.9454 (7.5148) grad_norm 2.4160 (2.2478) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:09:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][320/625] eta 0:03:00 lr 0.000748 wd 0.0500 time 0.5730 (0.5920) data time 0.0006 (0.0020) model time 0.5724 (0.5861) loss 8.3541 (7.5118) grad_norm 3.0179 (2.2442) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:09:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][330/625] eta 0:02:54 lr 0.000748 wd 0.0500 time 0.5635 (0.5916) data time 0.0011 (0.0019) model time 0.5624 (0.5858) loss 7.7875 (7.5182) grad_norm 2.3584 (2.2465) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:09:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][340/625] eta 0:02:48 lr 0.000748 wd 0.0500 time 0.5740 (0.5911) data time 0.0006 (0.0019) model time 0.5734 (0.5854) loss 6.0619 (7.5116) grad_norm 2.6961 (2.2492) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:09:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][350/625] eta 0:02:42 lr 0.000748 wd 0.0500 time 0.5727 (0.5907) data time 0.0006 (0.0019) model time 0.5721 (0.5850) loss 8.3743 (7.5168) grad_norm 2.3375 (2.2492) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:10:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][360/625] eta 0:02:36 lr 0.000747 wd 0.0500 time 0.5724 (0.5903) data time 0.0006 (0.0018) model time 0.5718 (0.5847) loss 8.3397 (7.5163) grad_norm 2.2334 (2.2501) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:10:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][370/625] eta 0:02:30 lr 0.000747 wd 0.0500 time 0.5695 (0.5899) data time 0.0006 (0.0018) model time 0.5689 (0.5844) loss 8.0795 (7.5102) grad_norm 1.7575 (2.2470) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:10:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][380/625] eta 0:02:24 lr 0.000747 wd 0.0500 time 0.5746 (0.5896) data time 0.0007 (0.0018) model time 0.5739 (0.5842) loss 8.7882 (7.5172) grad_norm 1.7654 (2.2396) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:10:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][390/625] eta 0:02:18 lr 0.000747 wd 0.0500 time 0.5743 (0.5892) data time 0.0008 (0.0018) model time 0.5735 (0.5839) loss 6.6876 (7.5070) grad_norm 1.7452 (2.2446) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:10:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][400/625] eta 0:02:12 lr 0.000747 wd 0.0500 time 0.5738 (0.5889) data time 0.0006 (0.0017) model time 0.5732 (0.5836) loss 5.4697 (7.5012) grad_norm 1.5298 (2.2411) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:10:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][410/625] eta 0:02:06 lr 0.000747 wd 0.0500 time 0.5625 (0.5886) data time 0.0006 (0.0017) model time 0.5618 (0.5835) loss 8.6913 (7.4963) grad_norm 3.1765 (2.2622) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:10:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][420/625] eta 0:02:00 lr 0.000747 wd 0.0500 time 0.5725 (0.5883) data time 0.0008 (0.0017) model time 0.5717 (0.5833) loss 7.8786 (7.4957) grad_norm 2.4227 (2.2707) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:10:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][430/625] eta 0:01:54 lr 0.000747 wd 0.0500 time 0.5715 (0.5881) data time 0.0007 (0.0017) model time 0.5708 (0.5831) loss 6.1747 (7.4948) grad_norm 1.9231 (2.2704) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:10:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][440/625] eta 0:01:48 lr 0.000747 wd 0.0500 time 0.5721 (0.5878) data time 0.0009 (0.0016) model time 0.5713 (0.5829) loss 8.1835 (7.5066) grad_norm 2.3123 (2.2627) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:10:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][450/625] eta 0:01:42 lr 0.000746 wd 0.0500 time 0.5722 (0.5875) data time 0.0008 (0.0016) model time 0.5714 (0.5827) loss 7.7032 (7.4982) grad_norm 2.3074 (2.2585) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:11:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][460/625] eta 0:01:36 lr 0.000746 wd 0.0500 time 0.5719 (0.5879) data time 0.0008 (0.0016) model time 0.5711 (0.5832) loss 6.3706 (7.5015) grad_norm 2.6219 (2.2569) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:11:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][470/625] eta 0:01:31 lr 0.000746 wd 0.0500 time 0.5691 (0.5888) data time 0.0008 (0.0016) model time 0.5683 (0.5843) loss 8.8497 (7.5015) grad_norm 4.8869 (2.2670) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:11:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][480/625] eta 0:01:25 lr 0.000746 wd 0.0500 time 0.5654 (0.5901) data time 0.0006 (0.0016) model time 0.5648 (0.5859) loss 7.1885 (7.5002) grad_norm 1.6217 (2.2603) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:11:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][490/625] eta 0:01:19 lr 0.000746 wd 0.0500 time 0.7193 (0.5913) data time 0.0007 (0.0016) model time 0.7186 (0.5873) loss 8.5146 (7.5119) grad_norm 1.3371 (2.2556) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:11:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][500/625] eta 0:01:13 lr 0.000746 wd 0.0500 time 0.5724 (0.5917) data time 0.0006 (0.0015) model time 0.5718 (0.5878) loss 6.2742 (7.5084) grad_norm 3.8276 (2.2689) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:11:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][510/625] eta 0:01:08 lr 0.000746 wd 0.0500 time 0.5728 (0.5914) data time 0.0006 (0.0015) model time 0.5722 (0.5875) loss 5.7748 (7.5018) grad_norm 2.0094 (2.2686) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:11:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][520/625] eta 0:01:02 lr 0.000746 wd 0.0500 time 0.5729 (0.5911) data time 0.0007 (0.0015) model time 0.5722 (0.5872) loss 7.8793 (7.4986) grad_norm 2.6218 (2.2652) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:11:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][530/625] eta 0:00:56 lr 0.000746 wd 0.0500 time 0.5729 (0.5908) data time 0.0006 (0.0015) model time 0.5723 (0.5869) loss 6.6599 (7.4884) grad_norm 2.2014 (2.2658) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:11:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][540/625] eta 0:00:50 lr 0.000746 wd 0.0500 time 0.5708 (0.5905) data time 0.0006 (0.0015) model time 0.5701 (0.5867) loss 6.3631 (7.4876) grad_norm 1.8664 (2.2649) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:11:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][550/625] eta 0:00:44 lr 0.000745 wd 0.0500 time 0.5697 (0.5902) data time 0.0009 (0.0015) model time 0.5688 (0.5864) loss 7.5648 (7.4864) grad_norm 2.2360 (2.2671) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:12:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][560/625] eta 0:00:38 lr 0.000745 wd 0.0500 time 0.5701 (0.5900) data time 0.0007 (0.0015) model time 0.5695 (0.5862) loss 6.7570 (7.4777) grad_norm 2.8002 (2.2648) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:12:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][570/625] eta 0:00:32 lr 0.000745 wd 0.0500 time 0.5737 (0.5897) data time 0.0008 (0.0015) model time 0.5729 (0.5860) loss 6.2410 (7.4760) grad_norm 1.5230 (2.2629) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:12:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][580/625] eta 0:00:26 lr 0.000745 wd 0.0500 time 0.5733 (0.5895) data time 0.0008 (0.0015) model time 0.5726 (0.5858) loss 7.1566 (7.4716) grad_norm 2.1081 (2.2581) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:12:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][590/625] eta 0:00:20 lr 0.000745 wd 0.0500 time 0.5723 (0.5892) data time 0.0008 (0.0015) model time 0.5714 (0.5856) loss 8.9725 (7.4764) grad_norm 1.8872 (2.2579) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:12:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][600/625] eta 0:00:14 lr 0.000745 wd 0.0500 time 0.5698 (0.5890) data time 0.0006 (0.0015) model time 0.5691 (0.5853) loss 8.2904 (7.4804) grad_norm 1.9872 (2.2529) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:12:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][610/625] eta 0:00:08 lr 0.000745 wd 0.0500 time 0.5735 (0.5887) data time 0.0006 (0.0014) model time 0.5729 (0.5851) loss 7.0993 (7.4807) grad_norm 1.6228 (2.2542) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:12:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [138/300][620/625] eta 0:00:02 lr 0.000745 wd 0.0500 time 0.5692 (0.5885) data time 0.0006 (0.0014) model time 0.5686 (0.5849) loss 9.0455 (7.4837) grad_norm 2.6427 (2.2558) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:12:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 138 training takes 0:06:07 +[2024-07-25 04:12:37 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 04:12:39 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 04:12:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.483 (0.483) Loss 0.5098 (0.5098) Acc@1 89.795 (89.795) Acc@5 98.779 (98.779) Mem 22339MB +[2024-07-25 04:12:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.8438 (0.6565) Acc@1 79.346 (85.693) Acc@5 95.898 (97.634) Mem 22339MB +[2024-07-25 04:12:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.143) Loss 0.9487 (0.7701) Acc@1 76.904 (82.666) Acc@5 94.580 (96.356) Mem 22339MB +[2024-07-25 04:12:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.320 Acc@5 96.337 +[2024-07-25 04:12:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.3% +[2024-07-25 04:12:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.843 (0.843) Loss 0.4958 (0.4958) Acc@1 89.600 (89.600) Acc@5 98.682 (98.682) Mem 22339MB +[2024-07-25 04:12:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.191) Loss 0.7827 (0.6274) Acc@1 81.396 (86.475) Acc@5 96.240 (97.727) Mem 22339MB +[2024-07-25 04:12:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.160) Loss 0.9116 (0.7345) Acc@1 77.490 (83.266) Acc@5 95.361 (96.682) Mem 22339MB +[2024-07-25 04:12:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.921 Acc@5 96.683 +[2024-07-25 04:12:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 82.9% +[2024-07-25 04:12:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.92% +[2024-07-25 04:12:46 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 04:12:47 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 04:12:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][0/625] eta 0:09:00 lr 0.000745 wd 0.0500 time 0.8652 (0.8652) data time 0.3471 (0.3471) model time 0.0000 (0.0000) loss 8.4272 (8.4272) grad_norm 1.8302 (1.8302) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:12:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][10/625] eta 0:06:19 lr 0.000745 wd 0.0500 time 0.5189 (0.6168) data time 0.0009 (0.0323) model time 0.0000 (0.0000) loss 8.1768 (7.2925) grad_norm 2.0716 (2.2160) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:13:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][20/625] eta 0:06:02 lr 0.000744 wd 0.0500 time 0.5729 (0.5984) data time 0.0008 (0.0173) model time 0.0000 (0.0000) loss 7.3556 (7.3163) grad_norm 1.9558 (2.3877) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:13:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][30/625] eta 0:05:51 lr 0.000744 wd 0.0500 time 0.5748 (0.5907) data time 0.0006 (0.0120) model time 0.0000 (0.0000) loss 7.6163 (7.4442) grad_norm 2.0525 (2.3172) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:13:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][40/625] eta 0:05:43 lr 0.000744 wd 0.0500 time 0.5725 (0.5866) data time 0.0006 (0.0093) model time 0.0000 (0.0000) loss 9.1868 (7.6734) grad_norm 3.1963 (2.3914) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:13:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][50/625] eta 0:05:37 lr 0.000744 wd 0.0500 time 0.7275 (0.5874) data time 0.0008 (0.0076) model time 0.0000 (0.0000) loss 8.0860 (7.6050) grad_norm 2.6945 (2.4200) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:13:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][60/625] eta 0:05:35 lr 0.000744 wd 0.0500 time 0.7112 (0.5943) data time 0.0006 (0.0065) model time 0.7106 (0.6286) loss 6.3843 (7.5108) grad_norm 2.9547 (2.5414) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:13:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][70/625] eta 0:05:33 lr 0.000744 wd 0.0500 time 0.7706 (0.6007) data time 0.0006 (0.0057) model time 0.7700 (0.6338) loss 6.8693 (7.4814) grad_norm 2.1994 (2.5184) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:13:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][80/625] eta 0:05:28 lr 0.000744 wd 0.0500 time 0.6744 (0.6025) data time 0.0008 (0.0051) model time 0.6736 (0.6274) loss 8.3779 (7.5217) grad_norm 2.0290 (2.5117) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:13:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][90/625] eta 0:05:23 lr 0.000744 wd 0.0500 time 0.5628 (0.6045) data time 0.0008 (0.0047) model time 0.5620 (0.6255) loss 6.9310 (7.5056) grad_norm 2.0764 (2.4851) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:13:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][100/625] eta 0:05:16 lr 0.000744 wd 0.0500 time 0.5757 (0.6034) data time 0.0006 (0.0043) model time 0.5751 (0.6189) loss 8.3057 (7.4868) grad_norm 2.3887 (2.4747) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:13:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][110/625] eta 0:05:09 lr 0.000744 wd 0.0500 time 0.5725 (0.6010) data time 0.0006 (0.0040) model time 0.5718 (0.6116) loss 5.9400 (7.4752) grad_norm 2.0493 (2.4985) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:14:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][120/625] eta 0:05:02 lr 0.000743 wd 0.0500 time 0.5635 (0.5989) data time 0.0006 (0.0037) model time 0.5629 (0.6063) loss 8.3931 (7.5106) grad_norm 1.8996 (2.4700) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:14:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][130/625] eta 0:04:55 lr 0.000743 wd 0.0500 time 0.5644 (0.5970) data time 0.0006 (0.0035) model time 0.5638 (0.6022) loss 7.5052 (7.5160) grad_norm 3.6087 (2.4655) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:14:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][140/625] eta 0:04:48 lr 0.000743 wd 0.0500 time 0.5741 (0.5954) data time 0.0010 (0.0033) model time 0.5731 (0.5991) loss 8.1706 (7.5366) grad_norm 2.2848 (2.4617) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:14:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][150/625] eta 0:04:42 lr 0.000743 wd 0.0500 time 0.5684 (0.5944) data time 0.0006 (0.0032) model time 0.5678 (0.5971) loss 6.9354 (7.5629) grad_norm 2.1203 (2.4587) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:14:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][160/625] eta 0:04:35 lr 0.000743 wd 0.0500 time 0.5643 (0.5935) data time 0.0006 (0.0030) model time 0.5637 (0.5954) loss 6.7653 (7.5601) grad_norm 2.5896 (2.4324) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:14:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][170/625] eta 0:04:29 lr 0.000743 wd 0.0500 time 0.5711 (0.5924) data time 0.0008 (0.0029) model time 0.5703 (0.5937) loss 6.8552 (7.5617) grad_norm 2.1320 (2.4494) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:14:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][180/625] eta 0:04:23 lr 0.000743 wd 0.0500 time 0.5727 (0.5915) data time 0.0006 (0.0028) model time 0.5721 (0.5922) loss 7.1181 (7.5560) grad_norm 1.7117 (2.4352) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:14:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][190/625] eta 0:04:16 lr 0.000743 wd 0.0500 time 0.5829 (0.5907) data time 0.0008 (0.0027) model time 0.5821 (0.5910) loss 8.6357 (7.5955) grad_norm 2.5318 (2.4273) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:14:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][200/625] eta 0:04:10 lr 0.000743 wd 0.0500 time 0.5702 (0.5898) data time 0.0008 (0.0026) model time 0.5694 (0.5898) loss 8.7189 (7.6267) grad_norm 2.4034 (2.4175) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:14:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][210/625] eta 0:04:04 lr 0.000742 wd 0.0500 time 0.5728 (0.5891) data time 0.0006 (0.0025) model time 0.5722 (0.5887) loss 6.4358 (7.6143) grad_norm 3.0577 (2.4119) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:14:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][220/625] eta 0:03:58 lr 0.000742 wd 0.0500 time 0.5696 (0.5884) data time 0.0008 (0.0024) model time 0.5688 (0.5878) loss 8.5769 (7.6214) grad_norm 2.0148 (2.4128) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:15:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][230/625] eta 0:03:52 lr 0.000742 wd 0.0500 time 0.7005 (0.5885) data time 0.0008 (0.0024) model time 0.6997 (0.5879) loss 7.8334 (7.6211) grad_norm 2.3734 (2.4209) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:15:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][240/625] eta 0:03:46 lr 0.000742 wd 0.0500 time 0.5724 (0.5877) data time 0.0006 (0.0023) model time 0.5718 (0.5869) loss 6.9685 (7.6242) grad_norm 2.5566 (2.4113) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:15:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][250/625] eta 0:03:40 lr 0.000742 wd 0.0500 time 0.5804 (0.5872) data time 0.0008 (0.0022) model time 0.5796 (0.5863) loss 8.0063 (7.6375) grad_norm 1.4786 (2.4109) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:15:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][260/625] eta 0:03:34 lr 0.000742 wd 0.0500 time 0.5649 (0.5867) data time 0.0008 (0.0022) model time 0.5641 (0.5857) loss 7.9640 (7.6388) grad_norm 1.9727 (2.4077) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:15:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][270/625] eta 0:03:28 lr 0.000742 wd 0.0500 time 0.7356 (0.5869) data time 0.0006 (0.0021) model time 0.7350 (0.5859) loss 6.8708 (7.6153) grad_norm 5.2584 (2.4340) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:15:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][280/625] eta 0:03:22 lr 0.000742 wd 0.0500 time 0.7279 (0.5874) data time 0.0008 (0.0021) model time 0.7271 (0.5866) loss 6.3964 (7.6085) grad_norm 1.6016 (2.4381) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:15:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][290/625] eta 0:03:17 lr 0.000742 wd 0.0500 time 0.5683 (0.5882) data time 0.0009 (0.0020) model time 0.5675 (0.5875) loss 8.4568 (7.6127) grad_norm 2.2180 (2.4386) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:15:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][300/625] eta 0:03:11 lr 0.000742 wd 0.0500 time 0.7025 (0.5902) data time 0.0009 (0.0020) model time 0.7017 (0.5900) loss 7.1498 (7.6128) grad_norm 2.0130 (2.4271) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:15:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][310/625] eta 0:03:06 lr 0.000741 wd 0.0500 time 0.5699 (0.5919) data time 0.0006 (0.0020) model time 0.5693 (0.5919) loss 5.8773 (7.6029) grad_norm 2.9737 (2.4244) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:15:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][320/625] eta 0:03:00 lr 0.000741 wd 0.0500 time 0.5739 (0.5917) data time 0.0008 (0.0019) model time 0.5731 (0.5916) loss 7.6614 (7.5934) grad_norm 2.0282 (2.4253) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:16:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][330/625] eta 0:02:54 lr 0.000741 wd 0.0500 time 0.5743 (0.5911) data time 0.0008 (0.0019) model time 0.5735 (0.5910) loss 8.0831 (7.5901) grad_norm 2.4231 (2.4198) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:16:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][340/625] eta 0:02:48 lr 0.000741 wd 0.0500 time 0.5696 (0.5906) data time 0.0009 (0.0019) model time 0.5687 (0.5903) loss 6.4852 (7.5898) grad_norm 2.0792 (2.4203) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:16:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][350/625] eta 0:02:42 lr 0.000741 wd 0.0500 time 0.5733 (0.5902) data time 0.0006 (0.0018) model time 0.5727 (0.5898) loss 7.6058 (7.5929) grad_norm 2.4549 (2.4147) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:16:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][360/625] eta 0:02:36 lr 0.000741 wd 0.0500 time 0.5762 (0.5898) data time 0.0006 (0.0018) model time 0.5756 (0.5893) loss 8.2368 (7.5861) grad_norm 2.7686 (2.4135) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:16:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][370/625] eta 0:02:30 lr 0.000741 wd 0.0500 time 0.5719 (0.5893) data time 0.0006 (0.0018) model time 0.5713 (0.5888) loss 6.4993 (7.5754) grad_norm 2.5274 (2.4148) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:16:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][380/625] eta 0:02:24 lr 0.000741 wd 0.0500 time 0.5719 (0.5889) data time 0.0008 (0.0018) model time 0.5711 (0.5883) loss 7.9409 (7.5742) grad_norm 1.8637 (2.4105) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:16:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][390/625] eta 0:02:18 lr 0.000741 wd 0.0500 time 0.5722 (0.5886) data time 0.0006 (0.0017) model time 0.5716 (0.5879) loss 8.1675 (7.5839) grad_norm 2.0729 (2.3963) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:16:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][400/625] eta 0:02:12 lr 0.000741 wd 0.0500 time 0.5744 (0.5883) data time 0.0008 (0.0017) model time 0.5736 (0.5875) loss 6.5916 (7.5776) grad_norm 1.9050 (2.3895) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:16:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][410/625] eta 0:02:06 lr 0.000740 wd 0.0500 time 0.5758 (0.5879) data time 0.0009 (0.0017) model time 0.5749 (0.5872) loss 7.9980 (7.5804) grad_norm 3.3897 (2.3890) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:16:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][420/625] eta 0:02:00 lr 0.000740 wd 0.0500 time 0.5772 (0.5877) data time 0.0007 (0.0017) model time 0.5766 (0.5869) loss 7.9922 (7.5936) grad_norm 3.2023 (2.3826) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:17:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][430/625] eta 0:01:54 lr 0.000740 wd 0.0500 time 0.5749 (0.5875) data time 0.0006 (0.0017) model time 0.5743 (0.5866) loss 5.6032 (7.5942) grad_norm 2.0176 (2.3783) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:17:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][440/625] eta 0:01:48 lr 0.000740 wd 0.0500 time 0.5632 (0.5872) data time 0.0008 (0.0016) model time 0.5624 (0.5863) loss 8.0484 (7.6041) grad_norm 1.9591 (2.3742) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:17:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][450/625] eta 0:01:42 lr 0.000740 wd 0.0500 time 0.5745 (0.5870) data time 0.0008 (0.0016) model time 0.5737 (0.5860) loss 7.6594 (7.5958) grad_norm 2.0782 (2.3744) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:17:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][460/625] eta 0:01:36 lr 0.000740 wd 0.0500 time 0.5721 (0.5871) data time 0.0006 (0.0016) model time 0.5714 (0.5861) loss 5.8671 (7.5959) grad_norm 2.1747 (2.3719) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:17:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][470/625] eta 0:01:30 lr 0.000740 wd 0.0500 time 0.5737 (0.5868) data time 0.0006 (0.0016) model time 0.5731 (0.5859) loss 6.8405 (7.5923) grad_norm 2.3416 (2.3611) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:17:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][480/625] eta 0:01:25 lr 0.000740 wd 0.0500 time 0.5638 (0.5866) data time 0.0009 (0.0016) model time 0.5629 (0.5856) loss 7.9653 (7.6017) grad_norm 2.8969 (2.3640) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:17:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][490/625] eta 0:01:19 lr 0.000740 wd 0.0500 time 0.5655 (0.5867) data time 0.0007 (0.0016) model time 0.5647 (0.5857) loss 7.9369 (7.6016) grad_norm 1.8701 (2.3655) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:17:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][500/625] eta 0:01:13 lr 0.000739 wd 0.0500 time 0.5691 (0.5873) data time 0.0006 (0.0015) model time 0.5685 (0.5864) loss 8.3597 (7.6024) grad_norm 3.1749 (2.3593) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:17:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][510/625] eta 0:01:07 lr 0.000739 wd 0.0500 time 0.7256 (0.5883) data time 0.0006 (0.0015) model time 0.7249 (0.5875) loss 7.9210 (7.6049) grad_norm 1.6321 (2.3557) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:17:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][520/625] eta 0:01:01 lr 0.000739 wd 0.0500 time 0.7591 (0.5891) data time 0.0006 (0.0015) model time 0.7585 (0.5885) loss 5.7822 (7.6010) grad_norm 1.9682 (2.3484) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:18:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][530/625] eta 0:00:56 lr 0.000739 wd 0.0500 time 0.5743 (0.5900) data time 0.0006 (0.0015) model time 0.5737 (0.5894) loss 9.8594 (7.6013) grad_norm 2.2899 (2.3413) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:18:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][540/625] eta 0:00:50 lr 0.000739 wd 0.0500 time 0.5715 (0.5900) data time 0.0008 (0.0015) model time 0.5707 (0.5894) loss 6.9066 (7.5961) grad_norm 2.8412 (2.3340) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:18:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][550/625] eta 0:00:44 lr 0.000739 wd 0.0500 time 0.5697 (0.5897) data time 0.0009 (0.0015) model time 0.5688 (0.5891) loss 8.6799 (7.6014) grad_norm 2.0300 (2.3354) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:18:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][560/625] eta 0:00:38 lr 0.000739 wd 0.0500 time 0.5692 (0.5894) data time 0.0008 (0.0015) model time 0.5684 (0.5888) loss 7.0091 (7.5951) grad_norm 3.0965 (2.3384) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:18:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][570/625] eta 0:00:32 lr 0.000739 wd 0.0500 time 0.5735 (0.5892) data time 0.0006 (0.0015) model time 0.5729 (0.5885) loss 6.6315 (7.5862) grad_norm 3.2431 (2.3394) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:18:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][580/625] eta 0:00:26 lr 0.000739 wd 0.0500 time 0.5725 (0.5889) data time 0.0010 (0.0014) model time 0.5716 (0.5882) loss 6.7710 (7.5874) grad_norm 2.0285 (2.3389) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:18:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][590/625] eta 0:00:20 lr 0.000739 wd 0.0500 time 0.5613 (0.5887) data time 0.0009 (0.0014) model time 0.5604 (0.5880) loss 6.0907 (7.5820) grad_norm 1.9952 (2.3297) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:18:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][600/625] eta 0:00:14 lr 0.000738 wd 0.0500 time 0.5738 (0.5885) data time 0.0006 (0.0014) model time 0.5732 (0.5878) loss 5.7807 (7.5735) grad_norm 4.0414 (2.3266) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:18:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][610/625] eta 0:00:08 lr 0.000738 wd 0.0500 time 0.5639 (0.5883) data time 0.0004 (0.0014) model time 0.5635 (0.5876) loss 7.8616 (7.5790) grad_norm 1.8033 (2.3241) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:18:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [139/300][620/625] eta 0:00:02 lr 0.000738 wd 0.0500 time 0.5711 (0.5881) data time 0.0006 (0.0014) model time 0.5706 (0.5873) loss 8.1538 (7.5807) grad_norm 4.3569 (2.3295) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:18:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 139 training takes 0:06:07 +[2024-07-25 04:18:55 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 04:18:56 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 04:18:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.479 (0.479) Loss 0.5122 (0.5122) Acc@1 88.770 (88.770) Acc@5 98.633 (98.633) Mem 22339MB +[2024-07-25 04:18:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.8428 (0.6559) Acc@1 80.664 (85.827) Acc@5 95.605 (97.625) Mem 22339MB +[2024-07-25 04:18:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.9673 (0.7723) Acc@1 75.732 (82.515) Acc@5 94.971 (96.387) Mem 22339MB +[2024-07-25 04:19:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.276 Acc@5 96.379 +[2024-07-25 04:19:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.3% +[2024-07-25 04:19:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.800 (0.800) Loss 0.4946 (0.4946) Acc@1 89.746 (89.746) Acc@5 98.730 (98.730) Mem 22339MB +[2024-07-25 04:19:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.187) Loss 0.7812 (0.6270) Acc@1 81.641 (86.532) Acc@5 96.289 (97.732) Mem 22339MB +[2024-07-25 04:19:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.158) Loss 0.9111 (0.7338) Acc@1 77.393 (83.315) Acc@5 95.361 (96.687) Mem 22339MB +[2024-07-25 04:19:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.973 Acc@5 96.683 +[2024-07-25 04:19:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.0% +[2024-07-25 04:19:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 82.97% +[2024-07-25 04:19:04 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 04:19:05 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 04:19:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][0/625] eta 0:09:12 lr 0.000738 wd 0.0500 time 0.8840 (0.8840) data time 0.3657 (0.3657) model time 0.0000 (0.0000) loss 7.8641 (7.8641) grad_norm 1.9138 (1.9138) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:19:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][10/625] eta 0:06:09 lr 0.000738 wd 0.0500 time 0.5729 (0.6015) data time 0.0007 (0.0344) model time 0.0000 (0.0000) loss 6.3771 (7.3595) grad_norm 2.3518 (2.0950) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:19:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][20/625] eta 0:05:57 lr 0.000738 wd 0.0500 time 0.5719 (0.5907) data time 0.0008 (0.0184) model time 0.0000 (0.0000) loss 7.5941 (7.4815) grad_norm 3.7117 (2.3890) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:19:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][30/625] eta 0:05:48 lr 0.000738 wd 0.0500 time 0.5644 (0.5861) data time 0.0008 (0.0127) model time 0.0000 (0.0000) loss 7.3448 (7.5315) grad_norm 2.5364 (2.4085) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:19:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][40/625] eta 0:05:41 lr 0.000738 wd 0.0500 time 0.5744 (0.5838) data time 0.0008 (0.0098) model time 0.0000 (0.0000) loss 7.3238 (7.6340) grad_norm 1.6762 (2.3373) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:19:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][50/625] eta 0:05:34 lr 0.000738 wd 0.0500 time 0.5718 (0.5824) data time 0.0008 (0.0082) model time 0.0000 (0.0000) loss 9.0196 (7.5208) grad_norm 3.0297 (2.3147) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:19:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][60/625] eta 0:05:28 lr 0.000738 wd 0.0500 time 0.5641 (0.5815) data time 0.0006 (0.0070) model time 0.5635 (0.5761) loss 7.0190 (7.5146) grad_norm 2.5383 (2.2966) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:19:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][70/625] eta 0:05:22 lr 0.000737 wd 0.0500 time 0.5766 (0.5805) data time 0.0008 (0.0061) model time 0.5757 (0.5748) loss 5.9499 (7.4569) grad_norm 2.3687 (2.2843) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:19:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][80/625] eta 0:05:16 lr 0.000737 wd 0.0500 time 0.5740 (0.5802) data time 0.0008 (0.0055) model time 0.5732 (0.5755) loss 9.3272 (7.4901) grad_norm 1.8790 (2.2421) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:19:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][90/625] eta 0:05:11 lr 0.000737 wd 0.0500 time 0.5760 (0.5826) data time 0.0006 (0.0049) model time 0.5754 (0.5820) loss 8.9025 (7.5445) grad_norm 2.6458 (2.2255) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:20:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][100/625] eta 0:05:08 lr 0.000737 wd 0.0500 time 0.5716 (0.5869) data time 0.0008 (0.0045) model time 0.5708 (0.5907) loss 7.5428 (7.5025) grad_norm 2.6608 (2.1995) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:20:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][110/625] eta 0:05:05 lr 0.000737 wd 0.0500 time 0.7360 (0.5929) data time 0.0007 (0.0042) model time 0.7354 (0.6009) loss 7.6012 (7.5116) grad_norm 1.9917 (2.2199) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:20:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][120/625] eta 0:05:01 lr 0.000737 wd 0.0500 time 0.7756 (0.5972) data time 0.0008 (0.0039) model time 0.7748 (0.6071) loss 8.0553 (7.5034) grad_norm 3.0785 (2.2292) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:20:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][130/625] eta 0:04:55 lr 0.000737 wd 0.0500 time 0.5696 (0.5973) data time 0.0007 (0.0037) model time 0.5689 (0.6059) loss 6.4279 (7.5037) grad_norm 2.0906 (2.2268) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:20:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][140/625] eta 0:04:49 lr 0.000737 wd 0.0500 time 0.5752 (0.5959) data time 0.0006 (0.0035) model time 0.5746 (0.6027) loss 7.5811 (7.5378) grad_norm 1.7926 (2.2055) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:20:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][150/625] eta 0:04:42 lr 0.000737 wd 0.0500 time 0.5719 (0.5945) data time 0.0006 (0.0033) model time 0.5712 (0.5998) loss 6.4889 (7.5167) grad_norm 2.3824 (2.2071) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:20:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][160/625] eta 0:04:35 lr 0.000737 wd 0.0500 time 0.5712 (0.5932) data time 0.0006 (0.0032) model time 0.5706 (0.5974) loss 7.3592 (7.5089) grad_norm 2.7932 (2.1927) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:20:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][170/625] eta 0:04:29 lr 0.000736 wd 0.0500 time 0.5724 (0.5921) data time 0.0007 (0.0030) model time 0.5717 (0.5954) loss 8.9358 (7.5037) grad_norm 2.8303 (2.1948) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:20:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][180/625] eta 0:04:23 lr 0.000736 wd 0.0500 time 0.5719 (0.5911) data time 0.0006 (0.0029) model time 0.5713 (0.5937) loss 6.0364 (7.5194) grad_norm 2.6136 (2.1827) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:20:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][190/625] eta 0:04:16 lr 0.000736 wd 0.0500 time 0.5694 (0.5903) data time 0.0009 (0.0028) model time 0.5685 (0.5923) loss 9.8079 (7.5361) grad_norm 1.9928 (2.2029) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:21:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][200/625] eta 0:04:10 lr 0.000736 wd 0.0500 time 0.5739 (0.5895) data time 0.0006 (0.0027) model time 0.5732 (0.5911) loss 7.3354 (7.5307) grad_norm 3.1013 (2.2397) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:21:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][210/625] eta 0:04:04 lr 0.000736 wd 0.0500 time 0.5706 (0.5889) data time 0.0006 (0.0026) model time 0.5700 (0.5901) loss 7.7797 (7.5290) grad_norm 2.2838 (2.2280) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:21:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][220/625] eta 0:03:58 lr 0.000736 wd 0.0500 time 0.5729 (0.5889) data time 0.0008 (0.0025) model time 0.5720 (0.5901) loss 6.9580 (7.5202) grad_norm 1.8114 (2.2165) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:21:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][230/625] eta 0:03:52 lr 0.000736 wd 0.0500 time 0.5743 (0.5883) data time 0.0008 (0.0024) model time 0.5734 (0.5892) loss 8.3356 (7.5305) grad_norm 1.5907 (2.2056) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:21:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][240/625] eta 0:03:46 lr 0.000736 wd 0.0500 time 0.5750 (0.5878) data time 0.0008 (0.0024) model time 0.5742 (0.5884) loss 9.0715 (7.5326) grad_norm 1.9470 (2.2102) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:21:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][250/625] eta 0:03:40 lr 0.000736 wd 0.0500 time 0.5743 (0.5874) data time 0.0008 (0.0023) model time 0.5736 (0.5878) loss 7.9259 (7.5371) grad_norm 3.1618 (2.2144) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:21:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][260/625] eta 0:03:34 lr 0.000735 wd 0.0500 time 0.5727 (0.5869) data time 0.0007 (0.0023) model time 0.5720 (0.5872) loss 8.3338 (7.5343) grad_norm 2.2547 (2.2105) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:21:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][270/625] eta 0:03:28 lr 0.000735 wd 0.0500 time 0.5705 (0.5865) data time 0.0007 (0.0022) model time 0.5698 (0.5866) loss 8.6671 (7.5433) grad_norm 2.1582 (2.2094) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:21:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][280/625] eta 0:03:22 lr 0.000735 wd 0.0500 time 0.5747 (0.5862) data time 0.0006 (0.0021) model time 0.5741 (0.5862) loss 7.0250 (7.5502) grad_norm 1.9633 (2.2105) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:21:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][290/625] eta 0:03:16 lr 0.000735 wd 0.0500 time 0.5738 (0.5859) data time 0.0006 (0.0021) model time 0.5732 (0.5858) loss 8.7723 (7.5535) grad_norm 3.5093 (2.2156) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:22:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][300/625] eta 0:03:10 lr 0.000735 wd 0.0500 time 0.5712 (0.5855) data time 0.0008 (0.0021) model time 0.5704 (0.5853) loss 5.8094 (7.5370) grad_norm 3.4872 (2.2273) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:22:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][310/625] eta 0:03:04 lr 0.000735 wd 0.0500 time 0.7077 (0.5865) data time 0.0008 (0.0020) model time 0.7069 (0.5864) loss 8.8158 (7.5222) grad_norm 2.4468 (2.2242) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:22:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][320/625] eta 0:02:59 lr 0.000735 wd 0.0500 time 0.7728 (0.5879) data time 0.0008 (0.0020) model time 0.7720 (0.5881) loss 8.5747 (7.5309) grad_norm 2.1309 (2.2238) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:22:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][330/625] eta 0:02:53 lr 0.000735 wd 0.0500 time 0.6965 (0.5895) data time 0.0007 (0.0020) model time 0.6958 (0.5899) loss 7.1832 (7.5452) grad_norm 1.9940 (2.2475) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:22:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][340/625] eta 0:02:48 lr 0.000735 wd 0.0500 time 0.5673 (0.5907) data time 0.0007 (0.0019) model time 0.5667 (0.5913) loss 8.6282 (7.5423) grad_norm 1.9820 (2.2467) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:22:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][350/625] eta 0:02:42 lr 0.000735 wd 0.0500 time 0.5739 (0.5910) data time 0.0006 (0.0019) model time 0.5733 (0.5917) loss 7.5640 (7.5376) grad_norm 4.5050 (2.2556) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:22:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][360/625] eta 0:02:36 lr 0.000734 wd 0.0500 time 0.5721 (0.5906) data time 0.0006 (0.0019) model time 0.5715 (0.5911) loss 8.2280 (7.5291) grad_norm 1.8832 (2.2644) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:22:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][370/625] eta 0:02:30 lr 0.000734 wd 0.0500 time 0.5676 (0.5901) data time 0.0007 (0.0018) model time 0.5668 (0.5905) loss 7.9333 (7.5320) grad_norm 3.6406 (2.2572) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:22:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][380/625] eta 0:02:24 lr 0.000734 wd 0.0500 time 0.5725 (0.5898) data time 0.0008 (0.0018) model time 0.5717 (0.5901) loss 8.4448 (7.5253) grad_norm 1.8661 (2.2533) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:22:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][390/625] eta 0:02:18 lr 0.000734 wd 0.0500 time 0.5718 (0.5894) data time 0.0006 (0.0018) model time 0.5712 (0.5896) loss 6.8415 (7.5308) grad_norm 2.2127 (2.2606) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:23:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][400/625] eta 0:02:12 lr 0.000734 wd 0.0500 time 0.5731 (0.5890) data time 0.0008 (0.0018) model time 0.5724 (0.5892) loss 7.3242 (7.5363) grad_norm 1.5862 (2.2595) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:23:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][410/625] eta 0:02:06 lr 0.000734 wd 0.0500 time 0.5712 (0.5887) data time 0.0006 (0.0017) model time 0.5706 (0.5888) loss 8.5655 (7.5353) grad_norm 2.2875 (2.2594) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:23:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][420/625] eta 0:02:00 lr 0.000734 wd 0.0500 time 0.5728 (0.5885) data time 0.0008 (0.0017) model time 0.5721 (0.5885) loss 6.7428 (7.5391) grad_norm 2.5596 (2.2592) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:23:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][430/625] eta 0:01:54 lr 0.000734 wd 0.0500 time 0.5737 (0.5882) data time 0.0007 (0.0017) model time 0.5730 (0.5881) loss 7.6746 (7.5476) grad_norm 2.2962 (2.2679) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:23:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][440/625] eta 0:01:48 lr 0.000734 wd 0.0500 time 0.5762 (0.5881) data time 0.0009 (0.0017) model time 0.5753 (0.5880) loss 7.5216 (7.5430) grad_norm 1.9503 (2.2612) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:23:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][450/625] eta 0:01:42 lr 0.000733 wd 0.0500 time 0.5723 (0.5878) data time 0.0006 (0.0017) model time 0.5718 (0.5876) loss 6.9421 (7.5429) grad_norm 1.8761 (2.2548) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:23:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][460/625] eta 0:01:36 lr 0.000733 wd 0.0500 time 0.5744 (0.5875) data time 0.0009 (0.0016) model time 0.5735 (0.5873) loss 6.6139 (7.5456) grad_norm 1.6082 (2.2475) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:23:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][470/625] eta 0:01:31 lr 0.000733 wd 0.0500 time 0.5639 (0.5872) data time 0.0007 (0.0016) model time 0.5632 (0.5870) loss 8.6603 (7.5479) grad_norm 2.3203 (2.2431) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:23:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][480/625] eta 0:01:25 lr 0.000733 wd 0.0500 time 0.5721 (0.5870) data time 0.0006 (0.0016) model time 0.5715 (0.5867) loss 7.5716 (7.5501) grad_norm 2.8090 (2.2420) loss_scale 4096.0000 (2056.5156) mem 22339MB +[2024-07-25 04:23:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][490/625] eta 0:01:19 lr 0.000733 wd 0.0500 time 0.5729 (0.5867) data time 0.0006 (0.0016) model time 0.5723 (0.5864) loss 6.8964 (7.5397) grad_norm 2.2277 (2.2409) loss_scale 4096.0000 (2098.0530) mem 22339MB +[2024-07-25 04:23:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][500/625] eta 0:01:13 lr 0.000733 wd 0.0500 time 0.5740 (0.5865) data time 0.0008 (0.0016) model time 0.5732 (0.5862) loss 7.8616 (7.5365) grad_norm 2.7124 (2.2468) loss_scale 4096.0000 (2137.9321) mem 22339MB +[2024-07-25 04:24:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][510/625] eta 0:01:07 lr 0.000733 wd 0.0500 time 0.5706 (0.5863) data time 0.0008 (0.0016) model time 0.5698 (0.5860) loss 7.7602 (7.5373) grad_norm 2.2576 (2.2531) loss_scale 4096.0000 (2176.2505) mem 22339MB +[2024-07-25 04:24:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][520/625] eta 0:01:01 lr 0.000733 wd 0.0500 time 0.5747 (0.5861) data time 0.0008 (0.0015) model time 0.5739 (0.5857) loss 7.6749 (7.5452) grad_norm 1.7900 (2.2514) loss_scale 4096.0000 (2213.0979) mem 22339MB +[2024-07-25 04:24:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][530/625] eta 0:00:55 lr 0.000733 wd 0.0500 time 0.5747 (0.5864) data time 0.0006 (0.0015) model time 0.5742 (0.5860) loss 7.1548 (7.5427) grad_norm 2.4589 (2.2559) loss_scale 4096.0000 (2248.5574) mem 22339MB +[2024-07-25 04:24:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][540/625] eta 0:00:49 lr 0.000733 wd 0.0500 time 0.6802 (0.5870) data time 0.0007 (0.0015) model time 0.6795 (0.5866) loss 6.8073 (7.5445) grad_norm 2.9377 (2.2578) loss_scale 4096.0000 (2282.7061) mem 22339MB +[2024-07-25 04:24:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][550/625] eta 0:00:44 lr 0.000732 wd 0.0500 time 0.7416 (0.5880) data time 0.0007 (0.0015) model time 0.7410 (0.5877) loss 8.0313 (7.5542) grad_norm 1.9586 (2.2592) loss_scale 4096.0000 (2315.6152) mem 22339MB +[2024-07-25 04:24:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][560/625] eta 0:00:38 lr 0.000732 wd 0.0500 time 0.7219 (0.5889) data time 0.0008 (0.0015) model time 0.7211 (0.5888) loss 8.3344 (7.5563) grad_norm 2.7262 (2.2585) loss_scale 4096.0000 (2347.3512) mem 22339MB +[2024-07-25 04:24:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][570/625] eta 0:00:32 lr 0.000732 wd 0.0500 time 0.5722 (0.5894) data time 0.0006 (0.0015) model time 0.5716 (0.5892) loss 8.6359 (7.5566) grad_norm 2.1697 (2.2563) loss_scale 4096.0000 (2377.9755) mem 22339MB +[2024-07-25 04:24:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][580/625] eta 0:00:26 lr 0.000732 wd 0.0500 time 0.5713 (0.5891) data time 0.0006 (0.0015) model time 0.5707 (0.5889) loss 8.1553 (7.5641) grad_norm 3.2005 (2.2572) loss_scale 4096.0000 (2407.5456) mem 22339MB +[2024-07-25 04:24:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][590/625] eta 0:00:20 lr 0.000732 wd 0.0500 time 0.5722 (0.5889) data time 0.0006 (0.0014) model time 0.5716 (0.5887) loss 6.7033 (7.5570) grad_norm 1.9810 (2.2620) loss_scale 4096.0000 (2436.1151) mem 22339MB +[2024-07-25 04:24:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][600/625] eta 0:00:14 lr 0.000732 wd 0.0500 time 0.5728 (0.5886) data time 0.0006 (0.0014) model time 0.5722 (0.5884) loss 6.6154 (7.5617) grad_norm 2.1677 (2.2601) loss_scale 4096.0000 (2463.7338) mem 22339MB +[2024-07-25 04:25:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][610/625] eta 0:00:08 lr 0.000732 wd 0.0500 time 0.5722 (0.5884) data time 0.0004 (0.0014) model time 0.5718 (0.5881) loss 7.6092 (7.5621) grad_norm 1.6803 (2.2685) loss_scale 4096.0000 (2490.4484) mem 22339MB +[2024-07-25 04:25:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [140/300][620/625] eta 0:00:02 lr 0.000732 wd 0.0500 time 0.5691 (0.5882) data time 0.0004 (0.0014) model time 0.5687 (0.5879) loss 7.1987 (7.5606) grad_norm 2.0228 (2.2692) loss_scale 4096.0000 (2516.3027) mem 22339MB +[2024-07-25 04:25:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 140 training takes 0:06:07 +[2024-07-25 04:25:13 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 04:25:14 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 04:25:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.493 (0.493) Loss 0.5239 (0.5239) Acc@1 88.818 (88.818) Acc@5 98.779 (98.779) Mem 22339MB +[2024-07-25 04:25:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.159) Loss 0.8662 (0.6606) Acc@1 80.029 (85.804) Acc@5 95.703 (97.496) Mem 22339MB +[2024-07-25 04:25:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.143) Loss 0.9575 (0.7709) Acc@1 77.295 (82.673) Acc@5 94.482 (96.324) Mem 22339MB +[2024-07-25 04:25:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.348 Acc@5 96.269 +[2024-07-25 04:25:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.3% +[2024-07-25 04:25:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.806 (0.806) Loss 0.4941 (0.4941) Acc@1 89.746 (89.746) Acc@5 98.779 (98.779) Mem 22339MB +[2024-07-25 04:25:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.188) Loss 0.7798 (0.6268) Acc@1 81.836 (86.590) Acc@5 96.240 (97.727) Mem 22339MB +[2024-07-25 04:25:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.158) Loss 0.9106 (0.7332) Acc@1 77.393 (83.361) Acc@5 95.312 (96.680) Mem 22339MB +[2024-07-25 04:25:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.011 Acc@5 96.679 +[2024-07-25 04:25:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.0% +[2024-07-25 04:25:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.01% +[2024-07-25 04:25:22 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 04:25:23 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 04:25:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][0/625] eta 0:09:09 lr 0.000732 wd 0.0500 time 0.8792 (0.8792) data time 0.3628 (0.3628) model time 0.0000 (0.0000) loss 8.2970 (8.2970) grad_norm 2.5834 (2.5834) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:25:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][10/625] eta 0:06:10 lr 0.000732 wd 0.0500 time 0.5759 (0.6026) data time 0.0007 (0.0337) model time 0.0000 (0.0000) loss 7.1315 (7.4746) grad_norm 3.0444 (2.8461) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:25:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][20/625] eta 0:05:56 lr 0.000731 wd 0.0500 time 0.5724 (0.5897) data time 0.0006 (0.0181) model time 0.0000 (0.0000) loss 6.4055 (7.3186) grad_norm 3.8634 (2.9059) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:25:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][30/625] eta 0:05:48 lr 0.000731 wd 0.0500 time 0.5698 (0.5851) data time 0.0006 (0.0125) model time 0.0000 (0.0000) loss 7.4403 (7.4685) grad_norm 1.7834 (2.7196) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:25:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][40/625] eta 0:05:40 lr 0.000731 wd 0.0500 time 0.5727 (0.5828) data time 0.0009 (0.0096) model time 0.0000 (0.0000) loss 8.2195 (7.5861) grad_norm 2.5589 (2.5885) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:25:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][50/625] eta 0:05:34 lr 0.000731 wd 0.0500 time 0.5732 (0.5816) data time 0.0006 (0.0079) model time 0.0000 (0.0000) loss 7.0296 (7.5800) grad_norm 1.7153 (2.4831) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:25:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][60/625] eta 0:05:28 lr 0.000731 wd 0.0500 time 0.5730 (0.5806) data time 0.0008 (0.0067) model time 0.5722 (0.5749) loss 6.8694 (7.5463) grad_norm 1.9928 (2.4367) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:26:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][70/625] eta 0:05:21 lr 0.000731 wd 0.0500 time 0.5719 (0.5800) data time 0.0008 (0.0059) model time 0.5711 (0.5752) loss 7.0365 (7.5595) grad_norm 2.3031 (2.4282) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:26:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][80/625] eta 0:05:15 lr 0.000731 wd 0.0500 time 0.5736 (0.5794) data time 0.0006 (0.0053) model time 0.5730 (0.5750) loss 6.5474 (7.5504) grad_norm 2.1237 (2.4092) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:26:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][90/625] eta 0:05:09 lr 0.000731 wd 0.0500 time 0.5634 (0.5790) data time 0.0006 (0.0048) model time 0.5628 (0.5749) loss 5.8123 (7.5215) grad_norm 3.5331 (2.4006) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:26:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][100/625] eta 0:05:03 lr 0.000731 wd 0.0500 time 0.5773 (0.5787) data time 0.0007 (0.0044) model time 0.5767 (0.5750) loss 6.2424 (7.5097) grad_norm 2.7457 (2.4287) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:26:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][110/625] eta 0:04:57 lr 0.000731 wd 0.0500 time 0.5719 (0.5783) data time 0.0008 (0.0041) model time 0.5711 (0.5747) loss 8.8651 (7.4718) grad_norm 1.7095 (2.3890) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:26:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][120/625] eta 0:04:52 lr 0.000730 wd 0.0500 time 0.7270 (0.5792) data time 0.0006 (0.0038) model time 0.7264 (0.5767) loss 7.5469 (7.4721) grad_norm 2.0987 (2.3677) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:26:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][130/625] eta 0:04:46 lr 0.000730 wd 0.0500 time 0.5625 (0.5797) data time 0.0009 (0.0036) model time 0.5617 (0.5776) loss 7.1760 (7.4607) grad_norm 1.4799 (2.3387) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:26:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][140/625] eta 0:04:42 lr 0.000730 wd 0.0500 time 0.5754 (0.5825) data time 0.0010 (0.0034) model time 0.5744 (0.5822) loss 7.7078 (7.4577) grad_norm 2.1763 (2.3337) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:26:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][150/625] eta 0:04:38 lr 0.000730 wd 0.0500 time 0.7162 (0.5857) data time 0.0007 (0.0032) model time 0.7154 (0.5871) loss 6.4992 (7.4437) grad_norm 2.5260 (2.3439) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:26:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][160/625] eta 0:04:34 lr 0.000730 wd 0.0500 time 0.7306 (0.5902) data time 0.0006 (0.0031) model time 0.7300 (0.5934) loss 6.9263 (7.4333) grad_norm 3.6643 (2.3602) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:27:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][170/625] eta 0:04:28 lr 0.000730 wd 0.0500 time 0.5679 (0.5900) data time 0.0008 (0.0029) model time 0.5671 (0.5928) loss 8.2451 (7.4167) grad_norm 1.8858 (2.3896) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:27:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][180/625] eta 0:04:22 lr 0.000730 wd 0.0500 time 0.5689 (0.5898) data time 0.0006 (0.0032) model time 0.5682 (0.5917) loss 7.8607 (7.4149) grad_norm 2.2558 (2.4037) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:27:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][190/625] eta 0:04:16 lr 0.000730 wd 0.0500 time 0.5689 (0.5891) data time 0.0009 (0.0030) model time 0.5680 (0.5905) loss 8.4588 (7.4412) grad_norm 2.5848 (2.4017) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:27:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][200/625] eta 0:04:10 lr 0.000730 wd 0.0500 time 0.5696 (0.5886) data time 0.0009 (0.0029) model time 0.5688 (0.5898) loss 7.5274 (7.4478) grad_norm 2.2358 (2.4012) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:27:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][210/625] eta 0:04:04 lr 0.000729 wd 0.0500 time 0.5635 (0.5884) data time 0.0008 (0.0028) model time 0.5626 (0.5894) loss 8.2783 (7.4589) grad_norm 2.0115 (2.3851) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:27:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][220/625] eta 0:03:58 lr 0.000729 wd 0.0500 time 0.5658 (0.5879) data time 0.0006 (0.0027) model time 0.5652 (0.5886) loss 6.9339 (7.4623) grad_norm 1.8260 (2.3715) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:27:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][230/625] eta 0:03:52 lr 0.000729 wd 0.0500 time 0.5659 (0.5874) data time 0.0006 (0.0027) model time 0.5653 (0.5879) loss 8.3217 (7.4825) grad_norm 2.0805 (2.3794) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:27:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][240/625] eta 0:03:46 lr 0.000729 wd 0.0500 time 0.5715 (0.5870) data time 0.0006 (0.0026) model time 0.5709 (0.5873) loss 8.4390 (7.4888) grad_norm 2.1390 (2.3922) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:27:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][250/625] eta 0:03:39 lr 0.000729 wd 0.0500 time 0.5721 (0.5866) data time 0.0008 (0.0025) model time 0.5712 (0.5867) loss 6.5351 (7.4961) grad_norm 2.3867 (2.4043) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:27:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][260/625] eta 0:03:34 lr 0.000729 wd 0.0500 time 0.5659 (0.5863) data time 0.0007 (0.0025) model time 0.5651 (0.5864) loss 6.2946 (7.4955) grad_norm 1.9979 (2.4101) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:28:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][270/625] eta 0:03:28 lr 0.000729 wd 0.0500 time 0.5665 (0.5860) data time 0.0009 (0.0024) model time 0.5656 (0.5860) loss 8.1054 (7.5118) grad_norm 1.8827 (2.4169) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:28:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][280/625] eta 0:03:22 lr 0.000729 wd 0.0500 time 0.5724 (0.5857) data time 0.0008 (0.0023) model time 0.5716 (0.5856) loss 8.7189 (7.5324) grad_norm 1.8233 (2.4059) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:28:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][290/625] eta 0:03:16 lr 0.000729 wd 0.0500 time 0.5794 (0.5854) data time 0.0008 (0.0023) model time 0.5786 (0.5852) loss 8.4553 (7.5357) grad_norm 2.3401 (2.4070) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:28:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][300/625] eta 0:03:10 lr 0.000729 wd 0.0500 time 0.5743 (0.5851) data time 0.0008 (0.0022) model time 0.5735 (0.5847) loss 7.7504 (7.5404) grad_norm 2.0105 (2.4026) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:28:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][310/625] eta 0:03:04 lr 0.000728 wd 0.0500 time 0.5721 (0.5847) data time 0.0006 (0.0022) model time 0.5715 (0.5843) loss 7.9088 (7.5358) grad_norm 2.6716 (2.3940) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:28:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][320/625] eta 0:02:58 lr 0.000728 wd 0.0500 time 0.5704 (0.5844) data time 0.0006 (0.0021) model time 0.5697 (0.5839) loss 7.2970 (7.5366) grad_norm 2.6911 (2.3975) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:28:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][330/625] eta 0:02:52 lr 0.000728 wd 0.0500 time 0.5754 (0.5841) data time 0.0006 (0.0021) model time 0.5748 (0.5835) loss 7.8258 (7.5407) grad_norm 3.1450 (2.3922) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:28:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][340/625] eta 0:02:46 lr 0.000728 wd 0.0500 time 0.5729 (0.5839) data time 0.0008 (0.0021) model time 0.5722 (0.5833) loss 8.3501 (7.5493) grad_norm 2.0128 (2.3853) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:28:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][350/625] eta 0:02:40 lr 0.000728 wd 0.0500 time 0.5724 (0.5844) data time 0.0006 (0.0020) model time 0.5718 (0.5838) loss 7.3482 (7.5552) grad_norm 2.6330 (2.3810) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:28:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][360/625] eta 0:02:35 lr 0.000728 wd 0.0500 time 0.7510 (0.5864) data time 0.0007 (0.0020) model time 0.7504 (0.5861) loss 8.1980 (7.5519) grad_norm 1.3918 (2.3718) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:29:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][370/625] eta 0:02:29 lr 0.000728 wd 0.0500 time 0.7523 (0.5879) data time 0.0006 (0.0020) model time 0.7516 (0.5879) loss 7.4297 (7.5536) grad_norm 3.8303 (2.3677) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:29:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][380/625] eta 0:02:24 lr 0.000728 wd 0.0500 time 0.6108 (0.5897) data time 0.0006 (0.0019) model time 0.6102 (0.5899) loss 7.5822 (7.5548) grad_norm 1.5304 (2.3657) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:29:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][390/625] eta 0:02:18 lr 0.000728 wd 0.0500 time 0.5622 (0.5896) data time 0.0006 (0.0019) model time 0.5617 (0.5898) loss 8.5519 (7.5513) grad_norm 2.7173 (2.3755) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:29:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][400/625] eta 0:02:12 lr 0.000727 wd 0.0500 time 0.5719 (0.5892) data time 0.0007 (0.0019) model time 0.5712 (0.5893) loss 7.3878 (7.5473) grad_norm 2.0752 (2.3823) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:29:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][410/625] eta 0:02:06 lr 0.000727 wd 0.0500 time 0.5717 (0.5889) data time 0.0006 (0.0019) model time 0.5711 (0.5889) loss 6.4005 (7.5531) grad_norm 1.6867 (2.3719) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:29:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][420/625] eta 0:02:00 lr 0.000727 wd 0.0500 time 0.5613 (0.5886) data time 0.0006 (0.0018) model time 0.5606 (0.5886) loss 7.5663 (7.5477) grad_norm 2.4601 (2.3671) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:29:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][430/625] eta 0:01:54 lr 0.000727 wd 0.0500 time 0.5745 (0.5885) data time 0.0006 (0.0018) model time 0.5739 (0.5884) loss 8.6904 (7.5575) grad_norm 1.9634 (2.3584) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:29:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][440/625] eta 0:01:48 lr 0.000727 wd 0.0500 time 0.5727 (0.5885) data time 0.0008 (0.0018) model time 0.5719 (0.5884) loss 8.2783 (7.5589) grad_norm 2.9686 (2.3630) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 04:29:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][450/625] eta 0:01:42 lr 0.000727 wd 0.0500 time 0.5737 (0.5882) data time 0.0006 (0.0018) model time 0.5732 (0.5880) loss 6.2182 (7.5584) grad_norm 2.1233 (inf) loss_scale 2048.0000 (4073.2949) mem 22339MB +[2024-07-25 04:29:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][460/625] eta 0:01:37 lr 0.000727 wd 0.0500 time 0.5765 (0.5879) data time 0.0009 (0.0018) model time 0.5757 (0.5877) loss 8.5415 (7.5671) grad_norm 2.3268 (inf) loss_scale 2048.0000 (4029.3623) mem 22339MB +[2024-07-25 04:30:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][470/625] eta 0:01:31 lr 0.000727 wd 0.0500 time 0.5727 (0.5877) data time 0.0008 (0.0017) model time 0.5719 (0.5874) loss 6.6915 (7.5688) grad_norm 3.9624 (inf) loss_scale 2048.0000 (3987.2951) mem 22339MB +[2024-07-25 04:30:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][480/625] eta 0:01:25 lr 0.000727 wd 0.0500 time 0.5623 (0.5876) data time 0.0006 (0.0017) model time 0.5617 (0.5874) loss 6.1004 (7.5640) grad_norm 2.3308 (inf) loss_scale 2048.0000 (3946.9771) mem 22339MB +[2024-07-25 04:30:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][490/625] eta 0:01:19 lr 0.000727 wd 0.0500 time 0.5632 (0.5874) data time 0.0006 (0.0017) model time 0.5626 (0.5871) loss 7.0389 (7.5594) grad_norm 1.8632 (inf) loss_scale 2048.0000 (3908.3014) mem 22339MB +[2024-07-25 04:30:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][500/625] eta 0:01:13 lr 0.000726 wd 0.0500 time 0.5759 (0.5872) data time 0.0008 (0.0017) model time 0.5752 (0.5869) loss 7.2334 (7.5620) grad_norm 2.5322 (inf) loss_scale 2048.0000 (3871.1697) mem 22339MB +[2024-07-25 04:30:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][510/625] eta 0:01:07 lr 0.000726 wd 0.0500 time 0.5724 (0.5870) data time 0.0007 (0.0017) model time 0.5717 (0.5866) loss 6.8803 (7.5647) grad_norm 1.8336 (inf) loss_scale 2048.0000 (3835.4912) mem 22339MB +[2024-07-25 04:30:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][520/625] eta 0:01:01 lr 0.000726 wd 0.0500 time 0.5671 (0.5868) data time 0.0008 (0.0016) model time 0.5664 (0.5864) loss 6.3447 (7.5666) grad_norm 2.8332 (inf) loss_scale 2048.0000 (3801.1823) mem 22339MB +[2024-07-25 04:30:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][530/625] eta 0:00:55 lr 0.000726 wd 0.0500 time 0.5724 (0.5866) data time 0.0008 (0.0016) model time 0.5716 (0.5862) loss 8.8686 (7.5720) grad_norm 2.0325 (inf) loss_scale 2048.0000 (3768.1657) mem 22339MB +[2024-07-25 04:30:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][540/625] eta 0:00:49 lr 0.000726 wd 0.0500 time 0.5744 (0.5864) data time 0.0008 (0.0016) model time 0.5736 (0.5859) loss 7.2321 (7.5658) grad_norm 3.6748 (inf) loss_scale 2048.0000 (3736.3697) mem 22339MB +[2024-07-25 04:30:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][550/625] eta 0:00:43 lr 0.000726 wd 0.0500 time 0.5722 (0.5862) data time 0.0006 (0.0016) model time 0.5716 (0.5857) loss 7.0069 (7.5649) grad_norm 1.8309 (inf) loss_scale 2048.0000 (3705.7278) mem 22339MB +[2024-07-25 04:30:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][560/625] eta 0:00:38 lr 0.000726 wd 0.0500 time 0.5635 (0.5860) data time 0.0006 (0.0016) model time 0.5629 (0.5854) loss 7.2243 (7.5651) grad_norm 1.7410 (inf) loss_scale 2048.0000 (3676.1783) mem 22339MB +[2024-07-25 04:30:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][570/625] eta 0:00:32 lr 0.000726 wd 0.0500 time 0.5720 (0.5863) data time 0.0006 (0.0016) model time 0.5713 (0.5858) loss 8.8532 (7.5737) grad_norm 2.2106 (inf) loss_scale 2048.0000 (3647.6637) mem 22339MB +[2024-07-25 04:31:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][580/625] eta 0:00:26 lr 0.000726 wd 0.0500 time 0.7504 (0.5872) data time 0.0006 (0.0016) model time 0.7498 (0.5868) loss 6.4714 (7.5589) grad_norm 1.8304 (inf) loss_scale 2048.0000 (3620.1308) mem 22339MB +[2024-07-25 04:31:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][590/625] eta 0:00:20 lr 0.000726 wd 0.0500 time 0.5732 (0.5878) data time 0.0006 (0.0015) model time 0.5726 (0.5875) loss 6.9211 (7.5591) grad_norm 2.0454 (inf) loss_scale 2048.0000 (3593.5296) mem 22339MB +[2024-07-25 04:31:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][600/625] eta 0:00:14 lr 0.000725 wd 0.0500 time 0.7336 (0.5890) data time 0.0006 (0.0015) model time 0.7330 (0.5887) loss 7.4832 (7.5578) grad_norm 1.7343 (inf) loss_scale 2048.0000 (3567.8136) mem 22339MB +[2024-07-25 04:31:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][610/625] eta 0:00:08 lr 0.000725 wd 0.0500 time 0.5720 (0.5890) data time 0.0004 (0.0015) model time 0.5716 (0.5887) loss 7.7012 (7.5547) grad_norm 2.1700 (inf) loss_scale 2048.0000 (3542.9394) mem 22339MB +[2024-07-25 04:31:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [141/300][620/625] eta 0:00:02 lr 0.000725 wd 0.0500 time 0.5719 (0.5887) data time 0.0006 (0.0015) model time 0.5713 (0.5884) loss 7.9000 (7.5495) grad_norm 3.5976 (inf) loss_scale 2048.0000 (3518.8663) mem 22339MB +[2024-07-25 04:31:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 141 training takes 0:06:07 +[2024-07-25 04:31:31 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 04:31:33 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 04:31:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.488 (0.488) Loss 0.5156 (0.5156) Acc@1 88.672 (88.672) Acc@5 98.584 (98.584) Mem 22339MB +[2024-07-25 04:31:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.8115 (0.6443) Acc@1 80.518 (85.875) Acc@5 96.191 (97.630) Mem 22339MB +[2024-07-25 04:31:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.143) Loss 0.9424 (0.7589) Acc@1 77.246 (82.636) Acc@5 94.922 (96.375) Mem 22339MB +[2024-07-25 04:31:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.250 Acc@5 96.337 +[2024-07-25 04:31:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.2% +[2024-07-25 04:31:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.975 (0.975) Loss 0.4946 (0.4946) Acc@1 89.697 (89.697) Acc@5 98.828 (98.828) Mem 22339MB +[2024-07-25 04:31:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.203) Loss 0.7788 (0.6262) Acc@1 81.738 (86.572) Acc@5 96.240 (97.732) Mem 22339MB +[2024-07-25 04:31:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.166) Loss 0.9087 (0.7324) Acc@1 77.441 (83.357) Acc@5 95.312 (96.687) Mem 22339MB +[2024-07-25 04:31:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.009 Acc@5 96.679 +[2024-07-25 04:31:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.0% +[2024-07-25 04:31:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][0/625] eta 0:14:04 lr 0.000725 wd 0.0500 time 1.3519 (1.3519) data time 0.5262 (0.5262) model time 0.0000 (0.0000) loss 8.5211 (8.5211) grad_norm 2.8200 (2.8200) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:31:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][10/625] eta 0:06:35 lr 0.000725 wd 0.0500 time 0.5723 (0.6434) data time 0.0008 (0.0486) model time 0.0000 (0.0000) loss 6.5679 (7.1934) grad_norm 2.7415 (4.2534) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:31:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][20/625] eta 0:06:09 lr 0.000725 wd 0.0500 time 0.5757 (0.6106) data time 0.0007 (0.0259) model time 0.0000 (0.0000) loss 6.2368 (7.2281) grad_norm 2.3342 (3.6010) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:31:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][30/625] eta 0:05:56 lr 0.000725 wd 0.0500 time 0.5751 (0.5985) data time 0.0008 (0.0178) model time 0.0000 (0.0000) loss 7.6521 (7.3626) grad_norm 1.5876 (3.2054) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:32:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][40/625] eta 0:05:46 lr 0.000725 wd 0.0500 time 0.5712 (0.5923) data time 0.0008 (0.0136) model time 0.0000 (0.0000) loss 8.2541 (7.3218) grad_norm 2.5271 (2.9034) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:32:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][50/625] eta 0:05:38 lr 0.000725 wd 0.0500 time 0.5730 (0.5887) data time 0.0010 (0.0112) model time 0.0000 (0.0000) loss 6.3056 (7.3855) grad_norm 1.9288 (2.7463) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:32:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][60/625] eta 0:05:31 lr 0.000725 wd 0.0500 time 0.5741 (0.5864) data time 0.0008 (0.0095) model time 0.5734 (0.5737) loss 8.4852 (7.4229) grad_norm 3.0396 (2.6535) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:32:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][70/625] eta 0:05:24 lr 0.000724 wd 0.0500 time 0.5758 (0.5848) data time 0.0008 (0.0083) model time 0.5750 (0.5741) loss 8.3282 (7.5133) grad_norm 3.2670 (2.6422) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:32:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][80/625] eta 0:05:18 lr 0.000724 wd 0.0500 time 0.5740 (0.5835) data time 0.0006 (0.0073) model time 0.5734 (0.5739) loss 8.8048 (7.5942) grad_norm 1.8871 (2.5735) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:32:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][90/625] eta 0:05:11 lr 0.000724 wd 0.0500 time 0.5747 (0.5825) data time 0.0008 (0.0066) model time 0.5739 (0.5738) loss 7.2009 (7.5495) grad_norm 4.6346 (2.5750) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:32:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][100/625] eta 0:05:05 lr 0.000724 wd 0.0500 time 0.5771 (0.5817) data time 0.0009 (0.0061) model time 0.5762 (0.5738) loss 6.8320 (7.5118) grad_norm 2.1938 (2.5418) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:32:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][110/625] eta 0:04:59 lr 0.000724 wd 0.0500 time 0.5762 (0.5811) data time 0.0006 (0.0056) model time 0.5755 (0.5739) loss 8.1140 (7.5092) grad_norm 2.0805 (2.4977) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:32:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][120/625] eta 0:04:53 lr 0.000724 wd 0.0500 time 0.5747 (0.5806) data time 0.0006 (0.0052) model time 0.5741 (0.5739) loss 8.3844 (7.4690) grad_norm 3.7261 (2.5482) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:32:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][130/625] eta 0:04:47 lr 0.000724 wd 0.0500 time 0.5718 (0.5801) data time 0.0008 (0.0048) model time 0.5710 (0.5738) loss 7.3030 (7.4513) grad_norm 4.5482 (2.6035) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:33:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][140/625] eta 0:04:41 lr 0.000724 wd 0.0500 time 0.5757 (0.5795) data time 0.0007 (0.0046) model time 0.5750 (0.5735) loss 7.5172 (7.5018) grad_norm 2.8808 (2.6374) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:33:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][150/625] eta 0:04:35 lr 0.000724 wd 0.0500 time 0.5755 (0.5792) data time 0.0008 (0.0043) model time 0.5747 (0.5735) loss 7.7720 (7.4807) grad_norm 2.9138 (2.6065) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:33:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][160/625] eta 0:04:29 lr 0.000723 wd 0.0500 time 0.7142 (0.5800) data time 0.0006 (0.0041) model time 0.7136 (0.5751) loss 8.3338 (7.4624) grad_norm 3.2233 (2.5817) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:33:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][170/625] eta 0:04:24 lr 0.000723 wd 0.0500 time 0.5712 (0.5814) data time 0.0006 (0.0039) model time 0.5706 (0.5775) loss 6.8760 (7.4730) grad_norm 2.8036 (2.5680) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:33:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][180/625] eta 0:04:20 lr 0.000723 wd 0.0500 time 0.7362 (0.5850) data time 0.0006 (0.0038) model time 0.7356 (0.5826) loss 8.2790 (7.5005) grad_norm 2.2932 (2.5514) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:33:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][190/625] eta 0:04:15 lr 0.000723 wd 0.0500 time 0.5840 (0.5884) data time 0.0006 (0.0036) model time 0.5834 (0.5874) loss 7.5696 (7.4904) grad_norm 2.4265 (2.5314) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:33:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][200/625] eta 0:04:11 lr 0.000723 wd 0.0500 time 0.5789 (0.5908) data time 0.0008 (0.0035) model time 0.5781 (0.5907) loss 8.3769 (7.4776) grad_norm 1.8037 (2.5101) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:33:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][210/625] eta 0:04:04 lr 0.000723 wd 0.0500 time 0.5745 (0.5900) data time 0.0006 (0.0033) model time 0.5739 (0.5896) loss 7.5598 (7.4814) grad_norm 1.8464 (2.4822) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:33:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][220/625] eta 0:03:58 lr 0.000723 wd 0.0500 time 0.5735 (0.5896) data time 0.0008 (0.0032) model time 0.5727 (0.5890) loss 7.9655 (7.4840) grad_norm 2.1331 (2.4607) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 04:33:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][230/625] eta 0:03:52 lr 0.000723 wd 0.0500 time 0.5769 (0.5889) data time 0.0007 (0.0031) model time 0.5762 (0.5881) loss 7.2254 (7.4991) grad_norm 1.9301 (inf) loss_scale 1024.0000 (2016.9697) mem 22339MB +[2024-07-25 04:34:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][240/625] eta 0:03:46 lr 0.000723 wd 0.0500 time 0.5755 (0.5883) data time 0.0006 (0.0030) model time 0.5749 (0.5874) loss 6.6173 (7.4970) grad_norm 2.7317 (inf) loss_scale 1024.0000 (1975.7676) mem 22339MB +[2024-07-25 04:34:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][250/625] eta 0:03:40 lr 0.000723 wd 0.0500 time 0.5740 (0.5878) data time 0.0008 (0.0029) model time 0.5732 (0.5867) loss 6.4235 (7.4801) grad_norm 2.2579 (inf) loss_scale 1024.0000 (1937.8486) mem 22339MB +[2024-07-25 04:34:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][260/625] eta 0:03:34 lr 0.000722 wd 0.0500 time 0.5760 (0.5874) data time 0.0007 (0.0029) model time 0.5753 (0.5862) loss 5.7839 (7.4784) grad_norm 2.3389 (inf) loss_scale 1024.0000 (1902.8352) mem 22339MB +[2024-07-25 04:34:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][270/625] eta 0:03:28 lr 0.000722 wd 0.0500 time 0.5725 (0.5869) data time 0.0008 (0.0028) model time 0.5718 (0.5857) loss 9.0308 (7.4889) grad_norm 2.0898 (inf) loss_scale 1024.0000 (1870.4059) mem 22339MB +[2024-07-25 04:34:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][280/625] eta 0:03:22 lr 0.000722 wd 0.0500 time 0.5759 (0.5866) data time 0.0006 (0.0027) model time 0.5753 (0.5853) loss 7.0424 (7.4938) grad_norm 2.3253 (inf) loss_scale 1024.0000 (1840.2847) mem 22339MB +[2024-07-25 04:34:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][290/625] eta 0:03:16 lr 0.000722 wd 0.0500 time 0.5770 (0.5862) data time 0.0006 (0.0026) model time 0.5765 (0.5848) loss 7.5346 (7.5046) grad_norm 2.7152 (inf) loss_scale 1024.0000 (1812.2337) mem 22339MB +[2024-07-25 04:34:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][300/625] eta 0:03:10 lr 0.000722 wd 0.0500 time 0.5749 (0.5858) data time 0.0006 (0.0026) model time 0.5743 (0.5844) loss 7.9869 (7.5168) grad_norm 1.7748 (inf) loss_scale 1024.0000 (1786.0465) mem 22339MB +[2024-07-25 04:34:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][310/625] eta 0:03:04 lr 0.000722 wd 0.0500 time 0.5796 (0.5855) data time 0.0007 (0.0025) model time 0.5789 (0.5840) loss 7.4381 (7.5105) grad_norm 3.5480 (inf) loss_scale 1024.0000 (1761.5434) mem 22339MB +[2024-07-25 04:34:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][320/625] eta 0:02:58 lr 0.000722 wd 0.0500 time 0.5782 (0.5852) data time 0.0008 (0.0025) model time 0.5774 (0.5837) loss 7.2586 (7.5213) grad_norm 2.2421 (inf) loss_scale 1024.0000 (1738.5670) mem 22339MB +[2024-07-25 04:34:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][330/625] eta 0:02:52 lr 0.000722 wd 0.0500 time 0.6255 (0.5851) data time 0.0008 (0.0024) model time 0.6247 (0.5836) loss 6.9691 (7.5331) grad_norm 2.3054 (inf) loss_scale 1024.0000 (1716.9789) mem 22339MB +[2024-07-25 04:35:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][340/625] eta 0:02:46 lr 0.000722 wd 0.0500 time 0.5742 (0.5848) data time 0.0006 (0.0024) model time 0.5736 (0.5832) loss 6.1557 (7.5299) grad_norm 7.8205 (inf) loss_scale 1024.0000 (1696.6569) mem 22339MB +[2024-07-25 04:35:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][350/625] eta 0:02:40 lr 0.000721 wd 0.0500 time 0.5765 (0.5845) data time 0.0006 (0.0023) model time 0.5759 (0.5830) loss 7.7549 (7.5458) grad_norm 2.9580 (inf) loss_scale 1024.0000 (1677.4929) mem 22339MB +[2024-07-25 04:35:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][360/625] eta 0:02:34 lr 0.000721 wd 0.0500 time 0.5800 (0.5842) data time 0.0006 (0.0023) model time 0.5794 (0.5827) loss 7.9191 (7.5537) grad_norm 1.8423 (inf) loss_scale 1024.0000 (1659.3906) mem 22339MB +[2024-07-25 04:35:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][370/625] eta 0:02:28 lr 0.000721 wd 0.0500 time 0.5750 (0.5840) data time 0.0008 (0.0022) model time 0.5742 (0.5824) loss 7.5531 (7.5597) grad_norm 1.5905 (inf) loss_scale 1024.0000 (1642.2642) mem 22339MB +[2024-07-25 04:35:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][380/625] eta 0:02:23 lr 0.000721 wd 0.0500 time 0.7027 (0.5842) data time 0.0007 (0.0022) model time 0.7021 (0.5827) loss 6.5372 (7.5558) grad_norm 1.8610 (inf) loss_scale 1024.0000 (1626.0367) mem 22339MB +[2024-07-25 04:35:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][390/625] eta 0:02:17 lr 0.000721 wd 0.0500 time 0.5755 (0.5850) data time 0.0006 (0.0022) model time 0.5749 (0.5836) loss 7.7838 (7.5503) grad_norm 1.7372 (inf) loss_scale 1024.0000 (1610.6394) mem 22339MB +[2024-07-25 04:35:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][400/625] eta 0:02:11 lr 0.000721 wd 0.0500 time 0.5741 (0.5860) data time 0.0006 (0.0021) model time 0.5735 (0.5848) loss 5.8166 (7.5459) grad_norm 2.9657 (inf) loss_scale 1024.0000 (1596.0100) mem 22339MB +[2024-07-25 04:35:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][410/625] eta 0:02:06 lr 0.000721 wd 0.0500 time 0.5740 (0.5869) data time 0.0007 (0.0021) model time 0.5733 (0.5859) loss 8.8231 (7.5453) grad_norm 2.2436 (inf) loss_scale 1024.0000 (1582.0925) mem 22339MB +[2024-07-25 04:35:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][420/625] eta 0:02:00 lr 0.000721 wd 0.0500 time 0.5952 (0.5881) data time 0.0008 (0.0021) model time 0.5944 (0.5872) loss 7.4095 (7.5454) grad_norm 2.2254 (inf) loss_scale 1024.0000 (1568.8361) mem 22339MB +[2024-07-25 04:35:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][430/625] eta 0:01:54 lr 0.000721 wd 0.0500 time 0.5799 (0.5878) data time 0.0006 (0.0020) model time 0.5794 (0.5868) loss 8.3482 (7.5394) grad_norm 2.3833 (inf) loss_scale 1024.0000 (1556.1949) mem 22339MB +[2024-07-25 04:36:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][440/625] eta 0:01:48 lr 0.000721 wd 0.0500 time 0.5739 (0.5876) data time 0.0008 (0.0020) model time 0.5731 (0.5867) loss 8.7038 (7.5434) grad_norm 1.5459 (inf) loss_scale 1024.0000 (1544.1270) mem 22339MB +[2024-07-25 04:36:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][450/625] eta 0:01:42 lr 0.000720 wd 0.0500 time 0.5593 (0.5874) data time 0.0009 (0.0020) model time 0.5584 (0.5864) loss 6.4917 (7.5363) grad_norm 2.5546 (inf) loss_scale 1024.0000 (1532.5942) mem 22339MB +[2024-07-25 04:36:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][460/625] eta 0:01:36 lr 0.000720 wd 0.0500 time 0.5764 (0.5871) data time 0.0008 (0.0020) model time 0.5756 (0.5861) loss 7.9914 (7.5241) grad_norm 1.7398 (inf) loss_scale 1024.0000 (1521.5618) mem 22339MB +[2024-07-25 04:36:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][470/625] eta 0:01:30 lr 0.000720 wd 0.0500 time 0.5735 (0.5869) data time 0.0006 (0.0019) model time 0.5729 (0.5858) loss 6.9489 (7.5138) grad_norm 1.8634 (inf) loss_scale 1024.0000 (1510.9979) mem 22339MB +[2024-07-25 04:36:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][480/625] eta 0:01:25 lr 0.000720 wd 0.0500 time 0.5745 (0.5866) data time 0.0007 (0.0019) model time 0.5738 (0.5855) loss 7.4088 (7.5000) grad_norm 2.5935 (inf) loss_scale 1024.0000 (1500.8732) mem 22339MB +[2024-07-25 04:36:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][490/625] eta 0:01:19 lr 0.000720 wd 0.0500 time 0.5856 (0.5864) data time 0.0006 (0.0019) model time 0.5850 (0.5853) loss 7.2318 (7.4960) grad_norm 4.4233 (inf) loss_scale 1024.0000 (1491.1609) mem 22339MB +[2024-07-25 04:36:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][500/625] eta 0:01:13 lr 0.000720 wd 0.0500 time 0.5751 (0.5862) data time 0.0008 (0.0019) model time 0.5742 (0.5851) loss 10.2285 (7.5009) grad_norm 2.6361 (inf) loss_scale 1024.0000 (1481.8363) mem 22339MB +[2024-07-25 04:36:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][510/625] eta 0:01:07 lr 0.000720 wd 0.0500 time 0.5786 (0.5859) data time 0.0006 (0.0018) model time 0.5780 (0.5848) loss 7.2519 (7.5048) grad_norm 1.7722 (inf) loss_scale 1024.0000 (1472.8767) mem 22339MB +[2024-07-25 04:36:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][520/625] eta 0:01:01 lr 0.000720 wd 0.0500 time 0.5746 (0.5857) data time 0.0006 (0.0018) model time 0.5740 (0.5846) loss 8.6952 (7.5073) grad_norm 3.2747 (inf) loss_scale 1024.0000 (1464.2610) mem 22339MB +[2024-07-25 04:36:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][530/625] eta 0:00:55 lr 0.000720 wd 0.0500 time 0.5751 (0.5855) data time 0.0006 (0.0018) model time 0.5745 (0.5844) loss 7.0965 (7.5146) grad_norm 2.4016 (inf) loss_scale 1024.0000 (1455.9699) mem 22339MB +[2024-07-25 04:36:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][540/625] eta 0:00:49 lr 0.000720 wd 0.0500 time 0.5820 (0.5854) data time 0.0008 (0.0018) model time 0.5812 (0.5842) loss 8.6076 (7.5141) grad_norm 2.7160 (inf) loss_scale 1024.0000 (1447.9852) mem 22339MB +[2024-07-25 04:37:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][550/625] eta 0:00:43 lr 0.000719 wd 0.0500 time 0.5769 (0.5852) data time 0.0006 (0.0018) model time 0.5763 (0.5840) loss 7.9093 (7.5278) grad_norm 2.6900 (inf) loss_scale 1024.0000 (1440.2904) mem 22339MB +[2024-07-25 04:37:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][560/625] eta 0:00:38 lr 0.000719 wd 0.0500 time 0.5982 (0.5851) data time 0.0007 (0.0017) model time 0.5975 (0.5839) loss 8.9505 (7.5326) grad_norm 2.4783 (inf) loss_scale 1024.0000 (1432.8699) mem 22339MB +[2024-07-25 04:37:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][570/625] eta 0:00:32 lr 0.000719 wd 0.0500 time 0.5739 (0.5849) data time 0.0007 (0.0017) model time 0.5732 (0.5837) loss 7.6452 (7.5303) grad_norm 2.5456 (inf) loss_scale 1024.0000 (1425.7093) mem 22339MB +[2024-07-25 04:37:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][580/625] eta 0:00:26 lr 0.000719 wd 0.0500 time 0.5740 (0.5847) data time 0.0006 (0.0017) model time 0.5734 (0.5835) loss 8.5795 (7.5341) grad_norm 2.6371 (inf) loss_scale 1024.0000 (1418.7952) mem 22339MB +[2024-07-25 04:37:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][590/625] eta 0:00:20 lr 0.000719 wd 0.0500 time 0.5754 (0.5845) data time 0.0006 (0.0017) model time 0.5748 (0.5833) loss 7.3452 (7.5313) grad_norm 1.8065 (inf) loss_scale 512.0000 (1405.1844) mem 22339MB +[2024-07-25 04:37:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][600/625] eta 0:00:14 lr 0.000719 wd 0.0500 time 0.7325 (0.5849) data time 0.0006 (0.0017) model time 0.7319 (0.5837) loss 5.9397 (7.5274) grad_norm 1.5765 (inf) loss_scale 512.0000 (1390.3228) mem 22339MB +[2024-07-25 04:37:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][610/625] eta 0:00:08 lr 0.000719 wd 0.0500 time 0.7027 (0.5857) data time 0.0004 (0.0017) model time 0.7024 (0.5846) loss 8.0899 (7.5313) grad_norm 2.4670 (inf) loss_scale 512.0000 (1375.9476) mem 22339MB +[2024-07-25 04:37:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [142/300][620/625] eta 0:00:02 lr 0.000719 wd 0.0500 time 0.5763 (0.5861) data time 0.0005 (0.0017) model time 0.5758 (0.5850) loss 7.3799 (7.5328) grad_norm 2.4597 (inf) loss_scale 512.0000 (1362.0354) mem 22339MB +[2024-07-25 04:37:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 142 training takes 0:06:06 +[2024-07-25 04:37:47 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 04:37:48 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 04:37:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.718 (0.718) Loss 0.5264 (0.5264) Acc@1 88.574 (88.574) Acc@5 98.633 (98.633) Mem 22339MB +[2024-07-25 04:37:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.179) Loss 0.8569 (0.6582) Acc@1 80.225 (85.929) Acc@5 95.654 (97.621) Mem 22339MB +[2024-07-25 04:37:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.154) Loss 0.9795 (0.7743) Acc@1 76.221 (82.664) Acc@5 94.385 (96.377) Mem 22339MB +[2024-07-25 04:37:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.330 Acc@5 96.371 +[2024-07-25 04:37:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.3% +[2024-07-25 04:37:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.789 (0.789) Loss 0.4946 (0.4946) Acc@1 89.697 (89.697) Acc@5 98.779 (98.779) Mem 22339MB +[2024-07-25 04:37:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.186) Loss 0.7783 (0.6259) Acc@1 81.494 (86.594) Acc@5 96.240 (97.718) Mem 22339MB +[2024-07-25 04:37:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.158) Loss 0.9087 (0.7319) Acc@1 77.490 (83.373) Acc@5 95.361 (96.675) Mem 22339MB +[2024-07-25 04:37:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.027 Acc@5 96.677 +[2024-07-25 04:37:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.0% +[2024-07-25 04:37:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.03% +[2024-07-25 04:37:56 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 04:37:58 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 04:37:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][0/625] eta 0:08:56 lr 0.000719 wd 0.0500 time 0.8579 (0.8579) data time 0.3370 (0.3370) model time 0.0000 (0.0000) loss 7.5706 (7.5706) grad_norm 3.4426 (3.4426) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:38:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][10/625] eta 0:06:59 lr 0.000719 wd 0.0500 time 0.5727 (0.6817) data time 0.0006 (0.0314) model time 0.0000 (0.0000) loss 6.8072 (7.7727) grad_norm 2.1433 (2.3527) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:38:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][20/625] eta 0:06:25 lr 0.000718 wd 0.0500 time 0.5656 (0.6365) data time 0.0007 (0.0168) model time 0.0000 (0.0000) loss 8.2933 (7.3779) grad_norm 1.9107 (2.2110) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:38:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][30/625] eta 0:06:06 lr 0.000718 wd 0.0500 time 0.5741 (0.6164) data time 0.0008 (0.0116) model time 0.0000 (0.0000) loss 8.5296 (7.2847) grad_norm 1.8490 (2.2440) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:38:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][40/625] eta 0:05:54 lr 0.000718 wd 0.0500 time 0.5730 (0.6061) data time 0.0006 (0.0090) model time 0.0000 (0.0000) loss 9.4993 (7.4141) grad_norm 2.9245 (2.2195) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:38:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][50/625] eta 0:05:45 lr 0.000718 wd 0.0500 time 0.5751 (0.6002) data time 0.0006 (0.0074) model time 0.0000 (0.0000) loss 6.6774 (7.4062) grad_norm 2.0602 (2.2100) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:38:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][60/625] eta 0:05:36 lr 0.000718 wd 0.0500 time 0.5736 (0.5960) data time 0.0007 (0.0063) model time 0.5729 (0.5738) loss 6.6795 (7.3899) grad_norm 1.8952 (2.2549) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:38:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][70/625] eta 0:05:28 lr 0.000718 wd 0.0500 time 0.5703 (0.5927) data time 0.0007 (0.0055) model time 0.5696 (0.5727) loss 8.4869 (7.4718) grad_norm 1.5799 (2.2452) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:38:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][80/625] eta 0:05:21 lr 0.000718 wd 0.0500 time 0.5674 (0.5902) data time 0.0008 (0.0049) model time 0.5667 (0.5723) loss 6.8427 (7.4856) grad_norm 2.0964 (2.1991) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:38:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][90/625] eta 0:05:14 lr 0.000718 wd 0.0500 time 0.5714 (0.5884) data time 0.0006 (0.0045) model time 0.5708 (0.5725) loss 5.6181 (7.4565) grad_norm 2.4516 (2.1805) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:38:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][100/625] eta 0:05:08 lr 0.000718 wd 0.0500 time 0.5723 (0.5869) data time 0.0006 (0.0041) model time 0.5716 (0.5725) loss 8.6122 (7.4535) grad_norm 2.0163 (2.1704) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:39:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][110/625] eta 0:05:01 lr 0.000717 wd 0.0500 time 0.5716 (0.5858) data time 0.0008 (0.0038) model time 0.5708 (0.5727) loss 8.6363 (7.4480) grad_norm 1.9405 (2.1797) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:39:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][120/625] eta 0:04:55 lr 0.000717 wd 0.0500 time 0.5739 (0.5849) data time 0.0006 (0.0036) model time 0.5733 (0.5730) loss 7.9911 (7.4803) grad_norm 1.9348 (2.2268) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:39:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][130/625] eta 0:04:49 lr 0.000717 wd 0.0500 time 0.5699 (0.5841) data time 0.0006 (0.0034) model time 0.5693 (0.5731) loss 7.6721 (7.4719) grad_norm 3.1272 (2.2150) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:39:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][140/625] eta 0:04:42 lr 0.000717 wd 0.0500 time 0.5723 (0.5835) data time 0.0008 (0.0032) model time 0.5714 (0.5732) loss 6.9913 (7.5007) grad_norm 2.3667 (2.2172) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:39:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][150/625] eta 0:04:36 lr 0.000717 wd 0.0500 time 0.5701 (0.5828) data time 0.0007 (0.0030) model time 0.5693 (0.5732) loss 8.5632 (7.5174) grad_norm 2.0787 (2.2037) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:39:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][160/625] eta 0:04:30 lr 0.000717 wd 0.0500 time 0.5746 (0.5824) data time 0.0006 (0.0029) model time 0.5739 (0.5734) loss 6.9677 (7.5203) grad_norm 2.7803 (2.1992) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:39:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][170/625] eta 0:04:24 lr 0.000717 wd 0.0500 time 0.5732 (0.5821) data time 0.0008 (0.0028) model time 0.5724 (0.5736) loss 7.1272 (7.5245) grad_norm 1.6839 (2.1952) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:39:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][180/625] eta 0:04:18 lr 0.000717 wd 0.0500 time 0.5751 (0.5817) data time 0.0006 (0.0026) model time 0.5745 (0.5736) loss 6.8559 (7.5442) grad_norm 1.8514 (2.1903) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:39:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][190/625] eta 0:04:12 lr 0.000717 wd 0.0500 time 0.5705 (0.5814) data time 0.0006 (0.0025) model time 0.5700 (0.5737) loss 6.2297 (7.5476) grad_norm 1.7928 (2.1893) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:39:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][200/625] eta 0:04:07 lr 0.000717 wd 0.0500 time 0.5731 (0.5823) data time 0.0008 (0.0025) model time 0.5723 (0.5755) loss 6.9296 (7.5713) grad_norm 2.4723 (2.2338) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:40:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][210/625] eta 0:04:02 lr 0.000716 wd 0.0500 time 0.5678 (0.5853) data time 0.0006 (0.0024) model time 0.5672 (0.5797) loss 8.5768 (7.5693) grad_norm 4.8808 (2.2549) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:40:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][220/625] eta 0:03:57 lr 0.000716 wd 0.0500 time 0.5700 (0.5873) data time 0.0006 (0.0023) model time 0.5694 (0.5827) loss 5.8580 (7.5499) grad_norm 1.9968 (2.3034) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:40:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][230/625] eta 0:03:53 lr 0.000716 wd 0.0500 time 0.7328 (0.5902) data time 0.0007 (0.0022) model time 0.7321 (0.5865) loss 5.7459 (7.5573) grad_norm 1.4513 (2.2952) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:40:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][240/625] eta 0:03:47 lr 0.000716 wd 0.0500 time 0.5723 (0.5897) data time 0.0008 (0.0022) model time 0.5715 (0.5861) loss 6.4309 (7.5265) grad_norm 2.2366 (2.2831) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:40:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][250/625] eta 0:03:40 lr 0.000716 wd 0.0500 time 0.5728 (0.5892) data time 0.0008 (0.0021) model time 0.5720 (0.5856) loss 7.8360 (7.5282) grad_norm 2.3817 (2.2735) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:40:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][260/625] eta 0:03:34 lr 0.000716 wd 0.0500 time 0.5735 (0.5887) data time 0.0006 (0.0021) model time 0.5728 (0.5851) loss 8.0971 (7.5334) grad_norm 2.1254 (2.2690) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:40:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][270/625] eta 0:03:28 lr 0.000716 wd 0.0500 time 0.5680 (0.5881) data time 0.0008 (0.0020) model time 0.5672 (0.5845) loss 7.5745 (7.5190) grad_norm 2.5115 (2.2773) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:40:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][280/625] eta 0:03:22 lr 0.000716 wd 0.0500 time 0.5736 (0.5876) data time 0.0006 (0.0020) model time 0.5730 (0.5840) loss 6.8933 (7.5034) grad_norm 1.5946 (2.2744) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:40:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][290/625] eta 0:03:16 lr 0.000716 wd 0.0500 time 0.5708 (0.5871) data time 0.0008 (0.0020) model time 0.5700 (0.5835) loss 8.1030 (7.4871) grad_norm 2.3782 (2.2685) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:40:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][300/625] eta 0:03:10 lr 0.000715 wd 0.0500 time 0.5721 (0.5866) data time 0.0008 (0.0019) model time 0.5712 (0.5831) loss 8.3635 (7.4938) grad_norm 2.1934 (2.2619) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:41:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][310/625] eta 0:03:04 lr 0.000715 wd 0.0500 time 0.5717 (0.5862) data time 0.0008 (0.0019) model time 0.5709 (0.5827) loss 6.7994 (7.4749) grad_norm 2.1595 (2.2590) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:41:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][320/625] eta 0:02:58 lr 0.000715 wd 0.0500 time 0.5728 (0.5858) data time 0.0006 (0.0018) model time 0.5722 (0.5823) loss 6.3552 (7.4785) grad_norm 1.6484 (2.2543) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:41:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][330/625] eta 0:02:52 lr 0.000715 wd 0.0500 time 0.5701 (0.5855) data time 0.0006 (0.0018) model time 0.5694 (0.5821) loss 8.0181 (7.4829) grad_norm 1.6546 (2.2488) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:41:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][340/625] eta 0:02:46 lr 0.000715 wd 0.0500 time 0.5714 (0.5852) data time 0.0006 (0.0018) model time 0.5708 (0.5818) loss 7.8324 (7.4875) grad_norm 1.8699 (2.2520) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:41:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][350/625] eta 0:02:40 lr 0.000715 wd 0.0500 time 0.5740 (0.5849) data time 0.0006 (0.0018) model time 0.5734 (0.5815) loss 7.2746 (7.4922) grad_norm 2.5694 (2.2502) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:41:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][360/625] eta 0:02:34 lr 0.000715 wd 0.0500 time 0.5708 (0.5846) data time 0.0007 (0.0017) model time 0.5701 (0.5813) loss 6.6187 (7.4991) grad_norm 1.6101 (2.2413) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:41:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][370/625] eta 0:02:29 lr 0.000715 wd 0.0500 time 0.5762 (0.5844) data time 0.0006 (0.0017) model time 0.5756 (0.5811) loss 6.9097 (7.4859) grad_norm 2.2388 (2.2363) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:41:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][380/625] eta 0:02:23 lr 0.000715 wd 0.0500 time 0.5729 (0.5841) data time 0.0006 (0.0017) model time 0.5723 (0.5809) loss 7.1282 (7.4912) grad_norm 2.6419 (2.2420) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:41:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][390/625] eta 0:02:17 lr 0.000715 wd 0.0500 time 0.5717 (0.5839) data time 0.0008 (0.0017) model time 0.5710 (0.5807) loss 8.9256 (7.5057) grad_norm 3.7898 (2.2621) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:41:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][400/625] eta 0:02:11 lr 0.000714 wd 0.0500 time 0.5731 (0.5837) data time 0.0007 (0.0016) model time 0.5724 (0.5805) loss 5.8045 (7.5158) grad_norm 1.8415 (2.2667) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:41:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][410/625] eta 0:02:05 lr 0.000714 wd 0.0500 time 0.5708 (0.5835) data time 0.0007 (0.0016) model time 0.5701 (0.5803) loss 6.0481 (7.5197) grad_norm 1.8788 (2.2654) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:42:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][420/625] eta 0:01:59 lr 0.000714 wd 0.0500 time 0.5742 (0.5840) data time 0.0006 (0.0016) model time 0.5736 (0.5810) loss 8.2869 (7.5262) grad_norm 2.1955 (2.2647) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:42:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][430/625] eta 0:01:54 lr 0.000714 wd 0.0500 time 0.5716 (0.5850) data time 0.0007 (0.0016) model time 0.5709 (0.5822) loss 8.8371 (7.5219) grad_norm 2.3183 (2.2603) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:42:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][440/625] eta 0:01:48 lr 0.000714 wd 0.0500 time 0.6995 (0.5861) data time 0.0006 (0.0016) model time 0.6988 (0.5835) loss 7.4676 (7.5218) grad_norm 1.9121 (2.2623) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:42:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][450/625] eta 0:01:42 lr 0.000714 wd 0.0500 time 0.7395 (0.5872) data time 0.0006 (0.0015) model time 0.7388 (0.5848) loss 8.6701 (7.5194) grad_norm 2.1245 (2.2569) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:42:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][460/625] eta 0:01:36 lr 0.000714 wd 0.0500 time 0.5734 (0.5873) data time 0.0006 (0.0015) model time 0.5727 (0.5849) loss 7.2294 (7.5141) grad_norm 2.0505 (2.2562) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:42:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][470/625] eta 0:01:30 lr 0.000714 wd 0.0500 time 0.5697 (0.5870) data time 0.0008 (0.0015) model time 0.5689 (0.5846) loss 7.7287 (7.5125) grad_norm 3.8918 (2.2715) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:42:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][480/625] eta 0:01:25 lr 0.000714 wd 0.0500 time 0.5755 (0.5867) data time 0.0008 (0.0015) model time 0.5748 (0.5843) loss 7.6517 (7.5158) grad_norm 4.1076 (2.2840) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:42:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][490/625] eta 0:01:19 lr 0.000713 wd 0.0500 time 0.5738 (0.5865) data time 0.0006 (0.0015) model time 0.5732 (0.5841) loss 8.9737 (7.5250) grad_norm 2.6110 (2.2832) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:42:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][500/625] eta 0:01:13 lr 0.000713 wd 0.0500 time 0.5668 (0.5863) data time 0.0008 (0.0015) model time 0.5659 (0.5839) loss 8.8250 (7.5270) grad_norm 2.8595 (2.2846) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:42:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][510/625] eta 0:01:07 lr 0.000713 wd 0.0500 time 0.5617 (0.5861) data time 0.0006 (0.0015) model time 0.5610 (0.5837) loss 6.5890 (7.5277) grad_norm 2.2417 (2.2773) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:43:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][520/625] eta 0:01:01 lr 0.000713 wd 0.0500 time 0.5703 (0.5859) data time 0.0006 (0.0015) model time 0.5697 (0.5835) loss 7.6441 (7.5218) grad_norm 1.7259 (2.2695) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:43:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][530/625] eta 0:00:55 lr 0.000713 wd 0.0500 time 0.5741 (0.5857) data time 0.0006 (0.0014) model time 0.5735 (0.5833) loss 7.3370 (7.5145) grad_norm 1.7200 (2.2664) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:43:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][540/625] eta 0:00:49 lr 0.000713 wd 0.0500 time 0.5755 (0.5857) data time 0.0008 (0.0014) model time 0.5746 (0.5833) loss 7.5954 (7.5146) grad_norm 2.5305 (2.2668) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:43:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][550/625] eta 0:00:43 lr 0.000713 wd 0.0500 time 0.5709 (0.5856) data time 0.0006 (0.0014) model time 0.5703 (0.5832) loss 7.7011 (7.5141) grad_norm 3.3880 (2.2954) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:43:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][560/625] eta 0:00:38 lr 0.000713 wd 0.0500 time 0.5684 (0.5855) data time 0.0008 (0.0014) model time 0.5676 (0.5831) loss 6.8703 (7.5114) grad_norm 2.3083 (2.3021) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:43:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][570/625] eta 0:00:32 lr 0.000713 wd 0.0500 time 0.5760 (0.5853) data time 0.0008 (0.0014) model time 0.5752 (0.5830) loss 7.4742 (7.5095) grad_norm 1.6635 (2.2946) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:43:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][580/625] eta 0:00:26 lr 0.000713 wd 0.0500 time 0.5704 (0.5852) data time 0.0006 (0.0014) model time 0.5698 (0.5829) loss 6.8130 (7.5094) grad_norm 2.2671 (2.2909) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:43:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][590/625] eta 0:00:20 lr 0.000712 wd 0.0500 time 0.5603 (0.5851) data time 0.0008 (0.0014) model time 0.5595 (0.5828) loss 6.7000 (7.5136) grad_norm 1.7269 (2.2870) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:43:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][600/625] eta 0:00:14 lr 0.000712 wd 0.0500 time 0.5697 (0.5849) data time 0.0005 (0.0014) model time 0.5692 (0.5826) loss 6.0335 (7.4985) grad_norm 2.3008 (2.2887) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:43:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][610/625] eta 0:00:08 lr 0.000712 wd 0.0500 time 0.5737 (0.5848) data time 0.0004 (0.0014) model time 0.5733 (0.5825) loss 6.2560 (7.4980) grad_norm 1.6358 (2.2900) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:44:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [143/300][620/625] eta 0:00:02 lr 0.000712 wd 0.0500 time 0.5732 (0.5846) data time 0.0005 (0.0014) model time 0.5727 (0.5824) loss 9.4271 (7.5067) grad_norm 1.6487 (2.2867) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:44:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 143 training takes 0:06:05 +[2024-07-25 04:44:03 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 04:44:05 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 04:44:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.489 (0.489) Loss 0.5156 (0.5156) Acc@1 89.160 (89.160) Acc@5 98.486 (98.486) Mem 22339MB +[2024-07-25 04:44:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.159) Loss 0.8193 (0.6534) Acc@1 81.396 (85.929) Acc@5 95.898 (97.661) Mem 22339MB +[2024-07-25 04:44:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.143) Loss 0.9678 (0.7681) Acc@1 76.367 (82.775) Acc@5 94.238 (96.384) Mem 22339MB +[2024-07-25 04:44:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.378 Acc@5 96.365 +[2024-07-25 04:44:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.4% +[2024-07-25 04:44:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.822 (0.822) Loss 0.4944 (0.4944) Acc@1 89.746 (89.746) Acc@5 98.779 (98.779) Mem 22339MB +[2024-07-25 04:44:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.191) Loss 0.7783 (0.6255) Acc@1 81.445 (86.599) Acc@5 96.191 (97.710) Mem 22339MB +[2024-07-25 04:44:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.160) Loss 0.9077 (0.7311) Acc@1 77.441 (83.396) Acc@5 95.312 (96.670) Mem 22339MB +[2024-07-25 04:44:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.039 Acc@5 96.675 +[2024-07-25 04:44:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.0% +[2024-07-25 04:44:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.04% +[2024-07-25 04:44:12 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 04:44:13 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 04:44:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][0/625] eta 0:09:18 lr 0.000712 wd 0.0500 time 0.8937 (0.8937) data time 0.3725 (0.3725) model time 0.0000 (0.0000) loss 7.9829 (7.9829) grad_norm 1.9522 (1.9522) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:44:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][10/625] eta 0:06:15 lr 0.000712 wd 0.0500 time 0.5618 (0.6106) data time 0.0006 (0.0349) model time 0.0000 (0.0000) loss 8.4109 (7.0004) grad_norm 1.4659 (2.3657) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:44:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][20/625] eta 0:06:11 lr 0.000712 wd 0.0500 time 0.6958 (0.6140) data time 0.0008 (0.0187) model time 0.0000 (0.0000) loss 7.3538 (7.1242) grad_norm 2.8098 (2.4847) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:44:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][30/625] eta 0:06:10 lr 0.000712 wd 0.0500 time 0.7128 (0.6233) data time 0.0006 (0.0129) model time 0.0000 (0.0000) loss 6.0397 (7.1410) grad_norm 1.6695 (2.3496) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:44:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][40/625] eta 0:06:08 lr 0.000712 wd 0.0500 time 0.7110 (0.6307) data time 0.0006 (0.0100) model time 0.0000 (0.0000) loss 6.9649 (7.2292) grad_norm 2.3312 (2.3071) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:44:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][50/625] eta 0:06:03 lr 0.000712 wd 0.0500 time 0.7042 (0.6322) data time 0.0007 (0.0082) model time 0.0000 (0.0000) loss 8.7151 (7.3325) grad_norm 2.3868 (2.3168) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:44:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][60/625] eta 0:05:51 lr 0.000711 wd 0.0500 time 0.5732 (0.6227) data time 0.0008 (0.0070) model time 0.5724 (0.5734) loss 6.7290 (7.4094) grad_norm 1.9880 (2.3004) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:44:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][70/625] eta 0:05:41 lr 0.000711 wd 0.0500 time 0.5756 (0.6159) data time 0.0009 (0.0061) model time 0.5747 (0.5735) loss 8.3800 (7.4681) grad_norm 2.3700 (2.2706) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:45:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][80/625] eta 0:05:32 lr 0.000711 wd 0.0500 time 0.5730 (0.6109) data time 0.0008 (0.0055) model time 0.5721 (0.5739) loss 5.6868 (7.4645) grad_norm 2.6934 (2.2684) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:45:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][90/625] eta 0:05:24 lr 0.000711 wd 0.0500 time 0.5741 (0.6069) data time 0.0008 (0.0050) model time 0.5733 (0.5739) loss 9.2427 (7.4814) grad_norm 3.1545 (2.3097) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:45:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][100/625] eta 0:05:17 lr 0.000711 wd 0.0500 time 0.5739 (0.6039) data time 0.0008 (0.0045) model time 0.5731 (0.5742) loss 8.2769 (7.4988) grad_norm 2.6102 (2.3516) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:45:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][110/625] eta 0:05:09 lr 0.000711 wd 0.0500 time 0.5629 (0.6013) data time 0.0008 (0.0042) model time 0.5621 (0.5743) loss 8.1326 (7.4967) grad_norm 3.3600 (2.3884) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:45:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][120/625] eta 0:05:02 lr 0.000711 wd 0.0500 time 0.5766 (0.5991) data time 0.0006 (0.0039) model time 0.5760 (0.5743) loss 8.0246 (7.5389) grad_norm 2.5694 (2.3865) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:45:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][130/625] eta 0:04:55 lr 0.000711 wd 0.0500 time 0.5766 (0.5974) data time 0.0006 (0.0037) model time 0.5760 (0.5744) loss 8.1034 (7.5542) grad_norm 1.9884 (2.3715) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:45:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][140/625] eta 0:04:48 lr 0.000711 wd 0.0500 time 0.5730 (0.5958) data time 0.0006 (0.0035) model time 0.5723 (0.5743) loss 8.4456 (7.5396) grad_norm 1.5683 (2.3656) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:45:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][150/625] eta 0:04:42 lr 0.000710 wd 0.0500 time 0.5714 (0.5944) data time 0.0008 (0.0033) model time 0.5706 (0.5743) loss 8.8953 (7.5377) grad_norm 1.7470 (2.3531) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:45:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][160/625] eta 0:04:35 lr 0.000710 wd 0.0500 time 0.5687 (0.5931) data time 0.0007 (0.0031) model time 0.5680 (0.5742) loss 6.7679 (7.5220) grad_norm 1.5899 (2.3487) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:45:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][170/625] eta 0:04:29 lr 0.000710 wd 0.0500 time 0.5713 (0.5920) data time 0.0006 (0.0030) model time 0.5707 (0.5742) loss 8.0742 (7.5063) grad_norm 1.9413 (2.3478) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:46:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][180/625] eta 0:04:23 lr 0.000710 wd 0.0500 time 0.5723 (0.5911) data time 0.0008 (0.0029) model time 0.5716 (0.5741) loss 7.4805 (7.5183) grad_norm 2.6670 (2.3538) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:46:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][190/625] eta 0:04:17 lr 0.000710 wd 0.0500 time 0.5738 (0.5910) data time 0.0008 (0.0028) model time 0.5730 (0.5752) loss 8.3116 (7.5247) grad_norm 3.6876 (2.3971) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:46:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][200/625] eta 0:04:10 lr 0.000710 wd 0.0500 time 0.5735 (0.5902) data time 0.0006 (0.0027) model time 0.5729 (0.5751) loss 6.3825 (7.4947) grad_norm 2.5108 (2.4202) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:46:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][210/625] eta 0:04:04 lr 0.000710 wd 0.0500 time 0.5746 (0.5895) data time 0.0006 (0.0026) model time 0.5740 (0.5751) loss 6.7643 (7.4797) grad_norm 1.7488 (2.4213) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:46:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][220/625] eta 0:03:58 lr 0.000710 wd 0.0500 time 0.5723 (0.5889) data time 0.0006 (0.0026) model time 0.5717 (0.5750) loss 7.1262 (7.4797) grad_norm 1.7969 (2.4015) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:46:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][230/625] eta 0:03:52 lr 0.000710 wd 0.0500 time 0.5727 (0.5888) data time 0.0007 (0.0025) model time 0.5720 (0.5757) loss 8.0326 (7.5055) grad_norm 1.9009 (2.3802) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:46:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][240/625] eta 0:03:47 lr 0.000710 wd 0.0500 time 0.7491 (0.5902) data time 0.0006 (0.0024) model time 0.7485 (0.5781) loss 8.1979 (7.4989) grad_norm 2.8202 (2.3861) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:46:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][250/625] eta 0:03:41 lr 0.000709 wd 0.0500 time 0.5692 (0.5915) data time 0.0007 (0.0023) model time 0.5685 (0.5803) loss 7.0366 (7.4913) grad_norm 2.2319 (2.3864) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:46:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][260/625] eta 0:03:36 lr 0.000709 wd 0.0500 time 0.7118 (0.5935) data time 0.0008 (0.0023) model time 0.7110 (0.5832) loss 7.9430 (7.4894) grad_norm 2.0836 (2.3886) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:46:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][270/625] eta 0:03:31 lr 0.000709 wd 0.0500 time 0.5737 (0.5950) data time 0.0006 (0.0022) model time 0.5731 (0.5855) loss 7.5137 (7.4860) grad_norm 2.7210 (2.3870) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:47:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][280/625] eta 0:03:25 lr 0.000709 wd 0.0500 time 0.5734 (0.5943) data time 0.0006 (0.0022) model time 0.5728 (0.5851) loss 6.1764 (7.4791) grad_norm 3.0424 (2.4291) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:47:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][290/625] eta 0:03:18 lr 0.000709 wd 0.0500 time 0.5631 (0.5936) data time 0.0008 (0.0021) model time 0.5623 (0.5846) loss 8.5041 (7.4865) grad_norm 2.4630 (2.4394) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:47:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][300/625] eta 0:03:12 lr 0.000709 wd 0.0500 time 0.5629 (0.5930) data time 0.0008 (0.0021) model time 0.5621 (0.5842) loss 7.5756 (7.4846) grad_norm 2.6715 (2.4443) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:47:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][310/625] eta 0:03:06 lr 0.000709 wd 0.0500 time 0.5711 (0.5924) data time 0.0008 (0.0020) model time 0.5703 (0.5838) loss 7.4397 (7.4734) grad_norm 1.6318 (2.4323) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:47:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][320/625] eta 0:03:00 lr 0.000709 wd 0.0500 time 0.5740 (0.5919) data time 0.0008 (0.0020) model time 0.5732 (0.5835) loss 6.7596 (7.4677) grad_norm 2.4272 (2.4383) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:47:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][330/625] eta 0:02:54 lr 0.000709 wd 0.0500 time 0.5747 (0.5914) data time 0.0006 (0.0020) model time 0.5742 (0.5831) loss 6.7252 (7.4749) grad_norm 4.0567 (2.4510) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:47:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][340/625] eta 0:02:48 lr 0.000708 wd 0.0500 time 0.5725 (0.5909) data time 0.0008 (0.0019) model time 0.5717 (0.5829) loss 8.1374 (7.4772) grad_norm 1.7779 (2.4574) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:47:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][350/625] eta 0:02:42 lr 0.000708 wd 0.0500 time 0.5770 (0.5905) data time 0.0008 (0.0019) model time 0.5762 (0.5826) loss 9.4842 (7.4858) grad_norm 2.1990 (2.4528) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:47:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][360/625] eta 0:02:36 lr 0.000708 wd 0.0500 time 0.5763 (0.5901) data time 0.0006 (0.0019) model time 0.5757 (0.5824) loss 6.9447 (7.4778) grad_norm 1.8813 (2.4401) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:47:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][370/625] eta 0:02:30 lr 0.000708 wd 0.0500 time 0.5727 (0.5897) data time 0.0008 (0.0018) model time 0.5720 (0.5821) loss 8.4277 (7.4887) grad_norm 1.4254 (2.4332) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:47:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][380/625] eta 0:02:24 lr 0.000708 wd 0.0500 time 0.5745 (0.5893) data time 0.0006 (0.0018) model time 0.5739 (0.5819) loss 7.1931 (7.4933) grad_norm 4.0355 (2.4563) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:48:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][390/625] eta 0:02:18 lr 0.000708 wd 0.0500 time 0.5713 (0.5889) data time 0.0008 (0.0018) model time 0.5705 (0.5816) loss 7.9195 (7.5190) grad_norm 2.8301 (2.4635) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:48:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][400/625] eta 0:02:12 lr 0.000708 wd 0.0500 time 0.5697 (0.5886) data time 0.0007 (0.0018) model time 0.5690 (0.5815) loss 8.1907 (7.5098) grad_norm 2.5448 (2.4760) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:48:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][410/625] eta 0:02:06 lr 0.000708 wd 0.0500 time 0.5738 (0.5885) data time 0.0008 (0.0017) model time 0.5729 (0.5814) loss 8.3037 (7.5178) grad_norm 2.1884 (2.4731) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:48:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][420/625] eta 0:02:00 lr 0.000708 wd 0.0500 time 0.5737 (0.5882) data time 0.0008 (0.0017) model time 0.5730 (0.5813) loss 7.9495 (7.5192) grad_norm 2.0221 (2.4780) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:48:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][430/625] eta 0:01:54 lr 0.000708 wd 0.0500 time 0.5608 (0.5879) data time 0.0009 (0.0017) model time 0.5599 (0.5811) loss 8.0499 (7.5185) grad_norm 3.0327 (2.4770) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:48:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][440/625] eta 0:01:48 lr 0.000707 wd 0.0500 time 0.5704 (0.5876) data time 0.0008 (0.0017) model time 0.5695 (0.5809) loss 6.2417 (7.5180) grad_norm 1.8143 (2.4729) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:48:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][450/625] eta 0:01:42 lr 0.000707 wd 0.0500 time 0.5745 (0.5875) data time 0.0006 (0.0017) model time 0.5739 (0.5810) loss 7.4889 (7.5003) grad_norm 2.1945 (2.4860) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:48:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][460/625] eta 0:01:37 lr 0.000707 wd 0.0500 time 0.7572 (0.5881) data time 0.0006 (0.0016) model time 0.7565 (0.5818) loss 7.0913 (7.4979) grad_norm 2.0057 (2.4851) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:48:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][470/625] eta 0:01:31 lr 0.000707 wd 0.0500 time 0.5762 (0.5888) data time 0.0006 (0.0016) model time 0.5756 (0.5827) loss 8.9666 (7.4977) grad_norm 1.9209 (2.4778) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:48:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][480/625] eta 0:01:25 lr 0.000707 wd 0.0500 time 0.5716 (0.5900) data time 0.0006 (0.0016) model time 0.5709 (0.5842) loss 7.5145 (7.4826) grad_norm 6.4337 (2.4919) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:49:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][490/625] eta 0:01:19 lr 0.000707 wd 0.0500 time 0.5611 (0.5916) data time 0.0006 (0.0016) model time 0.5605 (0.5861) loss 6.1439 (7.4826) grad_norm 4.9246 (2.5195) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:49:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][500/625] eta 0:01:13 lr 0.000707 wd 0.0500 time 0.5724 (0.5913) data time 0.0011 (0.0016) model time 0.5713 (0.5859) loss 8.6987 (7.4884) grad_norm 2.3289 (2.5294) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:49:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][510/625] eta 0:01:07 lr 0.000707 wd 0.0500 time 0.5716 (0.5910) data time 0.0009 (0.0016) model time 0.5708 (0.5856) loss 9.8612 (7.4927) grad_norm 2.3594 (2.5236) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:49:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][520/625] eta 0:01:02 lr 0.000707 wd 0.0500 time 0.5747 (0.5907) data time 0.0009 (0.0015) model time 0.5738 (0.5854) loss 8.2511 (7.4906) grad_norm 1.8337 (2.5221) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:49:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][530/625] eta 0:00:56 lr 0.000706 wd 0.0500 time 0.5731 (0.5904) data time 0.0008 (0.0015) model time 0.5724 (0.5852) loss 8.6178 (7.4996) grad_norm 1.6741 (2.5205) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:49:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][540/625] eta 0:00:50 lr 0.000706 wd 0.0500 time 0.5737 (0.5901) data time 0.0008 (0.0015) model time 0.5729 (0.5849) loss 7.6795 (7.5028) grad_norm 2.6990 (2.5173) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:49:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][550/625] eta 0:00:44 lr 0.000706 wd 0.0500 time 0.5716 (0.5898) data time 0.0008 (0.0015) model time 0.5709 (0.5847) loss 6.2163 (7.4961) grad_norm 1.8575 (2.5152) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:49:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][560/625] eta 0:00:38 lr 0.000706 wd 0.0500 time 0.5728 (0.5896) data time 0.0007 (0.0015) model time 0.5721 (0.5845) loss 6.3050 (7.4991) grad_norm 4.8817 (2.5176) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:49:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][570/625] eta 0:00:32 lr 0.000706 wd 0.0500 time 0.5692 (0.5893) data time 0.0008 (0.0015) model time 0.5684 (0.5843) loss 8.1801 (7.5051) grad_norm 1.9268 (2.5153) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:49:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][580/625] eta 0:00:26 lr 0.000706 wd 0.0500 time 0.5664 (0.5891) data time 0.0005 (0.0015) model time 0.5658 (0.5841) loss 6.4778 (7.5105) grad_norm 2.0758 (2.5095) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:50:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][590/625] eta 0:00:20 lr 0.000706 wd 0.0500 time 0.5736 (0.5889) data time 0.0007 (0.0015) model time 0.5729 (0.5839) loss 6.6741 (7.5004) grad_norm 3.0236 (2.5035) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:50:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][600/625] eta 0:00:14 lr 0.000706 wd 0.0500 time 0.5705 (0.5886) data time 0.0008 (0.0014) model time 0.5697 (0.5838) loss 8.2661 (7.5005) grad_norm 2.0452 (2.4957) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:50:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][610/625] eta 0:00:08 lr 0.000706 wd 0.0500 time 0.5726 (0.5884) data time 0.0004 (0.0014) model time 0.5722 (0.5836) loss 8.1071 (7.5074) grad_norm 1.8150 (2.4958) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:50:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [144/300][620/625] eta 0:00:02 lr 0.000706 wd 0.0500 time 0.5652 (0.5882) data time 0.0006 (0.0014) model time 0.5646 (0.5834) loss 7.3963 (7.5146) grad_norm 1.6024 (2.4883) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:50:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 144 training takes 0:06:07 +[2024-07-25 04:50:21 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 04:50:23 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 04:50:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.484 (0.484) Loss 0.5161 (0.5161) Acc@1 89.844 (89.844) Acc@5 98.730 (98.730) Mem 22339MB +[2024-07-25 04:50:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.8438 (0.6631) Acc@1 80.615 (85.969) Acc@5 96.094 (97.599) Mem 22339MB +[2024-07-25 04:50:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.9609 (0.7728) Acc@1 77.051 (82.801) Acc@5 94.873 (96.470) Mem 22339MB +[2024-07-25 04:50:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.458 Acc@5 96.453 +[2024-07-25 04:50:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.5% +[2024-07-25 04:50:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.845 (0.845) Loss 0.4937 (0.4937) Acc@1 89.795 (89.795) Acc@5 98.779 (98.779) Mem 22339MB +[2024-07-25 04:50:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.191) Loss 0.7783 (0.6252) Acc@1 81.396 (86.639) Acc@5 96.240 (97.718) Mem 22339MB +[2024-07-25 04:50:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.160) Loss 0.9067 (0.7305) Acc@1 77.539 (83.422) Acc@5 95.312 (96.680) Mem 22339MB +[2024-07-25 04:50:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.065 Acc@5 96.685 +[2024-07-25 04:50:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.1% +[2024-07-25 04:50:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.07% +[2024-07-25 04:50:30 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 04:50:31 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 04:50:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][0/625] eta 0:09:42 lr 0.000705 wd 0.0500 time 0.9323 (0.9323) data time 0.4144 (0.4144) model time 0.0000 (0.0000) loss 8.7474 (8.7474) grad_norm 2.8185 (2.8185) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:50:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][10/625] eta 0:06:16 lr 0.000705 wd 0.0500 time 0.5734 (0.6120) data time 0.0006 (0.0384) model time 0.0000 (0.0000) loss 9.4363 (7.6626) grad_norm 2.2739 (2.4814) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:50:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][20/625] eta 0:05:59 lr 0.000705 wd 0.0500 time 0.5741 (0.5939) data time 0.0008 (0.0205) model time 0.0000 (0.0000) loss 8.4597 (7.5541) grad_norm 1.9252 (2.1880) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:50:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][30/625] eta 0:05:50 lr 0.000705 wd 0.0500 time 0.5732 (0.5885) data time 0.0008 (0.0141) model time 0.0000 (0.0000) loss 7.3985 (7.5163) grad_norm 2.3016 (2.2105) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:50:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][40/625] eta 0:05:44 lr 0.000705 wd 0.0500 time 0.7233 (0.5888) data time 0.0006 (0.0109) model time 0.0000 (0.0000) loss 6.5695 (7.4968) grad_norm 2.3702 (2.1828) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:51:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][50/625] eta 0:05:38 lr 0.000705 wd 0.0500 time 0.7076 (0.5890) data time 0.0008 (0.0089) model time 0.0000 (0.0000) loss 6.5672 (7.4115) grad_norm 2.6681 (2.1540) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:51:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][60/625] eta 0:05:37 lr 0.000705 wd 0.0500 time 0.6753 (0.5976) data time 0.0007 (0.0076) model time 0.6746 (0.6402) loss 8.5492 (7.3638) grad_norm 1.9493 (2.0877) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:51:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][70/625] eta 0:05:34 lr 0.000705 wd 0.0500 time 0.7545 (0.6030) data time 0.0006 (0.0071) model time 0.7538 (0.6359) loss 8.7367 (7.3777) grad_norm 3.5359 (2.1778) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:51:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][80/625] eta 0:05:30 lr 0.000705 wd 0.0500 time 0.7246 (0.6071) data time 0.0008 (0.0064) model time 0.7238 (0.6357) loss 8.3343 (7.4404) grad_norm 4.1639 (2.4249) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:51:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][90/625] eta 0:05:25 lr 0.000705 wd 0.0500 time 0.5688 (0.6081) data time 0.0006 (0.0058) model time 0.5682 (0.6306) loss 6.7841 (7.4431) grad_norm 3.1743 (2.5082) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:51:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][100/625] eta 0:05:17 lr 0.000704 wd 0.0500 time 0.5694 (0.6050) data time 0.0006 (0.0053) model time 0.5688 (0.6198) loss 8.4223 (7.4759) grad_norm 3.6010 (2.5747) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:51:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][110/625] eta 0:05:10 lr 0.000704 wd 0.0500 time 0.5752 (0.6027) data time 0.0006 (0.0049) model time 0.5746 (0.6129) loss 7.7208 (7.4858) grad_norm 1.6899 (2.5291) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:51:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][120/625] eta 0:05:03 lr 0.000704 wd 0.0500 time 0.5678 (0.6006) data time 0.0008 (0.0046) model time 0.5670 (0.6076) loss 7.7763 (7.4752) grad_norm 3.7869 (2.5066) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:51:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][130/625] eta 0:04:56 lr 0.000704 wd 0.0500 time 0.5726 (0.5995) data time 0.0008 (0.0043) model time 0.5718 (0.6048) loss 6.9470 (7.4864) grad_norm 2.1305 (2.4915) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:51:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][140/625] eta 0:04:49 lr 0.000704 wd 0.0500 time 0.5726 (0.5977) data time 0.0007 (0.0040) model time 0.5719 (0.6013) loss 8.9892 (7.4895) grad_norm 1.9569 (2.4568) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:52:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][150/625] eta 0:04:43 lr 0.000704 wd 0.0500 time 0.5724 (0.5964) data time 0.0006 (0.0038) model time 0.5717 (0.5989) loss 7.8520 (7.5049) grad_norm 2.6439 (2.4369) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:52:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][160/625] eta 0:04:36 lr 0.000704 wd 0.0500 time 0.5710 (0.5955) data time 0.0006 (0.0036) model time 0.5703 (0.5973) loss 7.8216 (7.5130) grad_norm 2.3887 (2.4372) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:52:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][170/625] eta 0:04:30 lr 0.000704 wd 0.0500 time 0.5704 (0.5942) data time 0.0009 (0.0035) model time 0.5695 (0.5953) loss 8.6269 (7.5073) grad_norm 2.1722 (2.4175) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:52:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][180/625] eta 0:04:23 lr 0.000704 wd 0.0500 time 0.5752 (0.5932) data time 0.0008 (0.0033) model time 0.5744 (0.5937) loss 6.4682 (7.5116) grad_norm 2.9374 (2.4061) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:52:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][190/625] eta 0:04:17 lr 0.000704 wd 0.0500 time 0.5733 (0.5923) data time 0.0008 (0.0032) model time 0.5725 (0.5924) loss 9.0634 (7.5170) grad_norm 2.8351 (2.4481) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:52:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][200/625] eta 0:04:11 lr 0.000703 wd 0.0500 time 0.5716 (0.5916) data time 0.0009 (0.0031) model time 0.5707 (0.5914) loss 8.2357 (7.5185) grad_norm 2.3473 (2.4481) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:52:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][210/625] eta 0:04:05 lr 0.000703 wd 0.0500 time 0.5706 (0.5909) data time 0.0006 (0.0030) model time 0.5700 (0.5904) loss 8.3558 (7.5342) grad_norm 2.2243 (2.4444) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:52:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][220/625] eta 0:03:59 lr 0.000703 wd 0.0500 time 0.5734 (0.5904) data time 0.0008 (0.0029) model time 0.5725 (0.5898) loss 5.9547 (7.4964) grad_norm 5.6658 (2.4646) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:52:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][230/625] eta 0:03:52 lr 0.000703 wd 0.0500 time 0.5735 (0.5898) data time 0.0009 (0.0028) model time 0.5726 (0.5889) loss 7.9521 (7.4941) grad_norm 2.3771 (2.4689) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:52:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][240/625] eta 0:03:46 lr 0.000703 wd 0.0500 time 0.5710 (0.5892) data time 0.0008 (0.0027) model time 0.5702 (0.5882) loss 7.7537 (7.4767) grad_norm 2.1413 (2.4521) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:52:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][250/625] eta 0:03:40 lr 0.000703 wd 0.0500 time 0.5730 (0.5886) data time 0.0006 (0.0026) model time 0.5723 (0.5874) loss 9.3855 (7.4632) grad_norm 2.1980 (2.4515) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:53:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][260/625] eta 0:03:34 lr 0.000703 wd 0.0500 time 0.5625 (0.5880) data time 0.0008 (0.0026) model time 0.5617 (0.5867) loss 8.0892 (7.4622) grad_norm 1.7793 (2.4649) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:53:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][270/625] eta 0:03:28 lr 0.000703 wd 0.0500 time 0.7066 (0.5885) data time 0.0008 (0.0025) model time 0.7057 (0.5874) loss 6.1485 (7.4430) grad_norm 2.0609 (2.4816) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:53:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][280/625] eta 0:03:23 lr 0.000703 wd 0.0500 time 0.5716 (0.5891) data time 0.0008 (0.0024) model time 0.5708 (0.5881) loss 7.4527 (7.4422) grad_norm 2.5522 (2.4788) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:53:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][290/625] eta 0:03:17 lr 0.000702 wd 0.0500 time 0.5704 (0.5898) data time 0.0010 (0.0024) model time 0.5695 (0.5890) loss 7.4054 (7.4499) grad_norm 2.8502 (2.4780) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:53:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][300/625] eta 0:03:12 lr 0.000702 wd 0.0500 time 0.7080 (0.5918) data time 0.0008 (0.0023) model time 0.7073 (0.5914) loss 6.9899 (7.4435) grad_norm 3.3703 (2.4822) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:53:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][310/625] eta 0:03:06 lr 0.000702 wd 0.0500 time 0.5718 (0.5926) data time 0.0006 (0.0023) model time 0.5713 (0.5923) loss 8.0652 (7.4505) grad_norm 2.0744 (2.5039) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:53:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][320/625] eta 0:03:00 lr 0.000702 wd 0.0500 time 0.5724 (0.5921) data time 0.0006 (0.0023) model time 0.5718 (0.5917) loss 6.6799 (7.4734) grad_norm 1.5526 (2.4971) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:53:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][330/625] eta 0:02:54 lr 0.000702 wd 0.0500 time 0.5636 (0.5917) data time 0.0006 (0.0022) model time 0.5630 (0.5911) loss 6.9095 (7.4870) grad_norm 2.4814 (2.4863) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:53:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][340/625] eta 0:02:48 lr 0.000702 wd 0.0500 time 0.5728 (0.5912) data time 0.0008 (0.0022) model time 0.5720 (0.5905) loss 9.0697 (7.5063) grad_norm 2.2741 (2.4830) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:53:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][350/625] eta 0:02:42 lr 0.000702 wd 0.0500 time 0.5727 (0.5907) data time 0.0008 (0.0021) model time 0.5719 (0.5900) loss 7.6536 (7.5068) grad_norm 2.5273 (2.4885) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:54:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][360/625] eta 0:02:36 lr 0.000702 wd 0.0500 time 0.5737 (0.5903) data time 0.0006 (0.0021) model time 0.5731 (0.5895) loss 6.7302 (7.4982) grad_norm 3.6462 (2.5055) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:54:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][370/625] eta 0:02:30 lr 0.000702 wd 0.0500 time 0.5726 (0.5899) data time 0.0008 (0.0021) model time 0.5718 (0.5891) loss 7.9700 (7.4900) grad_norm 2.1875 (2.5224) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:54:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][380/625] eta 0:02:24 lr 0.000702 wd 0.0500 time 0.5748 (0.5897) data time 0.0006 (0.0020) model time 0.5742 (0.5888) loss 8.2464 (7.4992) grad_norm 2.0806 (2.5166) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:54:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][390/625] eta 0:02:18 lr 0.000701 wd 0.0500 time 0.5729 (0.5893) data time 0.0008 (0.0020) model time 0.5721 (0.5884) loss 8.5951 (7.5064) grad_norm 2.0236 (2.5058) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:54:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][400/625] eta 0:02:12 lr 0.000701 wd 0.0500 time 0.5726 (0.5890) data time 0.0008 (0.0020) model time 0.5718 (0.5880) loss 8.8868 (7.5229) grad_norm 2.5433 (2.5099) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:54:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][410/625] eta 0:02:06 lr 0.000701 wd 0.0500 time 0.5614 (0.5887) data time 0.0009 (0.0019) model time 0.5606 (0.5876) loss 8.5726 (7.5208) grad_norm 1.5923 (2.4958) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:54:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][420/625] eta 0:02:00 lr 0.000701 wd 0.0500 time 0.5731 (0.5883) data time 0.0007 (0.0019) model time 0.5724 (0.5873) loss 8.4054 (7.5124) grad_norm 2.4733 (2.4840) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:54:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][430/625] eta 0:01:54 lr 0.000701 wd 0.0500 time 0.5734 (0.5880) data time 0.0008 (0.0019) model time 0.5726 (0.5869) loss 7.8651 (7.5057) grad_norm 1.9335 (2.4783) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:54:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][440/625] eta 0:01:48 lr 0.000701 wd 0.0500 time 0.5672 (0.5877) data time 0.0008 (0.0019) model time 0.5664 (0.5866) loss 5.5436 (7.4989) grad_norm 2.7970 (2.4703) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:54:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][450/625] eta 0:01:42 lr 0.000701 wd 0.0500 time 0.5725 (0.5874) data time 0.0008 (0.0018) model time 0.5717 (0.5863) loss 6.1689 (7.4983) grad_norm 2.4195 (2.4742) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:55:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][460/625] eta 0:01:36 lr 0.000701 wd 0.0500 time 0.5723 (0.5871) data time 0.0008 (0.0018) model time 0.5715 (0.5860) loss 6.4735 (7.4970) grad_norm 2.3277 (2.4738) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:55:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][470/625] eta 0:01:30 lr 0.000701 wd 0.0500 time 0.5712 (0.5869) data time 0.0006 (0.0018) model time 0.5705 (0.5857) loss 6.9103 (7.4923) grad_norm 2.9767 (2.4809) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:55:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][480/625] eta 0:01:25 lr 0.000700 wd 0.0500 time 0.5727 (0.5866) data time 0.0008 (0.0018) model time 0.5719 (0.5854) loss 8.0121 (7.4907) grad_norm 1.8306 (2.4784) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:55:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][490/625] eta 0:01:19 lr 0.000700 wd 0.0500 time 0.5610 (0.5869) data time 0.0006 (0.0018) model time 0.5604 (0.5858) loss 8.7836 (7.4933) grad_norm 2.2824 (2.4747) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:55:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][500/625] eta 0:01:13 lr 0.000700 wd 0.0500 time 0.5641 (0.5877) data time 0.0006 (0.0017) model time 0.5635 (0.5866) loss 7.8311 (7.5050) grad_norm 1.9692 (2.4772) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:55:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][510/625] eta 0:01:07 lr 0.000700 wd 0.0500 time 0.7317 (0.5888) data time 0.0006 (0.0017) model time 0.7311 (0.5879) loss 6.7112 (7.4950) grad_norm 1.9959 (2.4820) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:55:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][520/625] eta 0:01:01 lr 0.000700 wd 0.0500 time 0.7605 (0.5901) data time 0.0008 (0.0017) model time 0.7597 (0.5893) loss 7.6095 (7.4912) grad_norm 2.2100 (2.4725) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:55:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][530/625] eta 0:00:56 lr 0.000700 wd 0.0500 time 0.5763 (0.5906) data time 0.0006 (0.0017) model time 0.5757 (0.5899) loss 6.4646 (7.4912) grad_norm 1.9376 (2.4686) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:55:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][540/625] eta 0:00:50 lr 0.000700 wd 0.0500 time 0.5776 (0.5903) data time 0.0006 (0.0017) model time 0.5770 (0.5895) loss 7.8796 (7.4888) grad_norm 1.9810 (2.4606) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:55:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][550/625] eta 0:00:44 lr 0.000700 wd 0.0500 time 0.5721 (0.5901) data time 0.0008 (0.0017) model time 0.5713 (0.5893) loss 7.6879 (7.4914) grad_norm 1.8800 (2.4550) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:56:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][560/625] eta 0:00:38 lr 0.000700 wd 0.0500 time 0.5691 (0.5898) data time 0.0009 (0.0016) model time 0.5682 (0.5890) loss 7.1393 (7.4927) grad_norm 2.1925 (2.4495) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:56:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][570/625] eta 0:00:32 lr 0.000700 wd 0.0500 time 0.5682 (0.5896) data time 0.0007 (0.0016) model time 0.5675 (0.5887) loss 9.0287 (7.4961) grad_norm 2.6762 (2.4442) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:56:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][580/625] eta 0:00:26 lr 0.000699 wd 0.0500 time 0.5627 (0.5893) data time 0.0006 (0.0016) model time 0.5621 (0.5884) loss 7.4266 (7.5014) grad_norm 2.6753 (2.4425) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:56:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][590/625] eta 0:00:20 lr 0.000699 wd 0.0500 time 0.5613 (0.5892) data time 0.0008 (0.0016) model time 0.5605 (0.5883) loss 7.2492 (7.4971) grad_norm 2.2657 (2.4380) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:56:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][600/625] eta 0:00:14 lr 0.000699 wd 0.0500 time 0.5717 (0.5893) data time 0.0008 (0.0016) model time 0.5710 (0.5884) loss 5.6739 (7.4930) grad_norm 1.9755 (2.4342) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:56:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][610/625] eta 0:00:08 lr 0.000699 wd 0.0500 time 0.5630 (0.5891) data time 0.0006 (0.0016) model time 0.5624 (0.5882) loss 6.3011 (7.4827) grad_norm 2.1179 (2.4276) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:56:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [145/300][620/625] eta 0:00:02 lr 0.000699 wd 0.0500 time 0.5620 (0.5890) data time 0.0004 (0.0016) model time 0.5616 (0.5880) loss 8.6828 (7.4816) grad_norm 2.8294 (2.4250) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:56:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 145 training takes 0:06:08 +[2024-07-25 04:56:39 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 04:56:41 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 04:56:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.486 (0.486) Loss 0.5356 (0.5356) Acc@1 88.574 (88.574) Acc@5 98.535 (98.535) Mem 22339MB +[2024-07-25 04:56:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.8267 (0.6613) Acc@1 81.201 (85.866) Acc@5 96.338 (97.616) Mem 22339MB +[2024-07-25 04:56:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.143) Loss 0.9590 (0.7746) Acc@1 77.148 (82.703) Acc@5 94.531 (96.440) Mem 22339MB +[2024-07-25 04:56:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.394 Acc@5 96.441 +[2024-07-25 04:56:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.4% +[2024-07-25 04:56:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.991 (0.991) Loss 0.4927 (0.4927) Acc@1 89.746 (89.746) Acc@5 98.779 (98.779) Mem 22339MB +[2024-07-25 04:56:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.204) Loss 0.7769 (0.6247) Acc@1 81.201 (86.603) Acc@5 96.240 (97.736) Mem 22339MB +[2024-07-25 04:56:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.167) Loss 0.9058 (0.7297) Acc@1 77.686 (83.431) Acc@5 95.361 (96.691) Mem 22339MB +[2024-07-25 04:56:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.075 Acc@5 96.707 +[2024-07-25 04:56:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.1% +[2024-07-25 04:56:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.08% +[2024-07-25 04:56:48 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 04:56:50 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 04:56:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][0/625] eta 0:09:04 lr 0.000699 wd 0.0500 time 0.8715 (0.8715) data time 0.3522 (0.3522) model time 0.0000 (0.0000) loss 6.1260 (6.1260) grad_norm 3.1635 (3.1635) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:56:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][10/625] eta 0:06:10 lr 0.000699 wd 0.0500 time 0.5723 (0.6031) data time 0.0008 (0.0329) model time 0.0000 (0.0000) loss 8.2665 (7.4752) grad_norm 2.4423 (2.2851) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:57:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][20/625] eta 0:05:56 lr 0.000699 wd 0.0500 time 0.5717 (0.5898) data time 0.0008 (0.0176) model time 0.0000 (0.0000) loss 6.9671 (7.2574) grad_norm 3.4172 (2.3227) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:57:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][30/625] eta 0:05:48 lr 0.000699 wd 0.0500 time 0.5742 (0.5855) data time 0.0009 (0.0122) model time 0.0000 (0.0000) loss 6.6863 (7.2414) grad_norm 1.8394 (2.5152) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:57:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][40/625] eta 0:05:40 lr 0.000699 wd 0.0500 time 0.5640 (0.5828) data time 0.0006 (0.0095) model time 0.0000 (0.0000) loss 6.2454 (7.3672) grad_norm 2.1704 (2.5178) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:57:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][50/625] eta 0:05:34 lr 0.000698 wd 0.0500 time 0.5755 (0.5813) data time 0.0006 (0.0077) model time 0.0000 (0.0000) loss 6.3832 (7.3463) grad_norm 2.3347 (2.4972) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:57:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][60/625] eta 0:05:27 lr 0.000698 wd 0.0500 time 0.5683 (0.5802) data time 0.0006 (0.0066) model time 0.5677 (0.5740) loss 6.5920 (7.3772) grad_norm 1.5270 (2.4723) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:57:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][70/625] eta 0:05:21 lr 0.000698 wd 0.0500 time 0.5712 (0.5794) data time 0.0008 (0.0058) model time 0.5704 (0.5738) loss 8.5689 (7.4359) grad_norm 3.1066 (2.4337) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:57:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][80/625] eta 0:05:15 lr 0.000698 wd 0.0500 time 0.5720 (0.5795) data time 0.0006 (0.0052) model time 0.5713 (0.5757) loss 6.8017 (7.4105) grad_norm 2.5533 (2.4467) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 04:57:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][90/625] eta 0:05:11 lr 0.000698 wd 0.0500 time 0.5685 (0.5826) data time 0.0008 (0.0047) model time 0.5677 (0.5834) loss 6.6271 (7.4079) grad_norm 1.9696 (2.3926) loss_scale 1024.0000 (557.0110) mem 22339MB +[2024-07-25 04:57:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][100/625] eta 0:05:08 lr 0.000698 wd 0.0500 time 0.7794 (0.5868) data time 0.0006 (0.0043) model time 0.7788 (0.5916) loss 7.6230 (7.4260) grad_norm 1.7838 (2.3666) loss_scale 1024.0000 (603.2475) mem 22339MB +[2024-07-25 04:57:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][110/625] eta 0:05:03 lr 0.000698 wd 0.0500 time 0.6378 (0.5903) data time 0.0008 (0.0040) model time 0.6370 (0.5971) loss 8.5459 (7.3899) grad_norm 1.9013 (2.3599) loss_scale 1024.0000 (641.1532) mem 22339MB +[2024-07-25 04:58:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][120/625] eta 0:05:01 lr 0.000698 wd 0.0500 time 0.7227 (0.5977) data time 0.0006 (0.0037) model time 0.7220 (0.6088) loss 6.9175 (7.4092) grad_norm 2.0715 (2.3484) loss_scale 1024.0000 (672.7934) mem 22339MB +[2024-07-25 04:58:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][130/625] eta 0:04:55 lr 0.000698 wd 0.0500 time 0.5696 (0.5971) data time 0.0006 (0.0035) model time 0.5690 (0.6063) loss 8.1221 (7.4514) grad_norm 2.0147 (2.3583) loss_scale 1024.0000 (699.6031) mem 22339MB +[2024-07-25 04:58:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][140/625] eta 0:04:48 lr 0.000697 wd 0.0500 time 0.5678 (0.5954) data time 0.0007 (0.0033) model time 0.5672 (0.6026) loss 7.1474 (7.4731) grad_norm 2.4497 (2.3379) loss_scale 1024.0000 (722.6099) mem 22339MB +[2024-07-25 04:58:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][150/625] eta 0:04:42 lr 0.000697 wd 0.0500 time 0.5748 (0.5940) data time 0.0006 (0.0031) model time 0.5742 (0.5996) loss 6.4170 (7.4587) grad_norm 2.4835 (2.3444) loss_scale 1024.0000 (742.5695) mem 22339MB +[2024-07-25 04:58:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][160/625] eta 0:04:35 lr 0.000697 wd 0.0500 time 0.5700 (0.5928) data time 0.0008 (0.0030) model time 0.5692 (0.5973) loss 7.5270 (7.4521) grad_norm 2.0188 (2.3657) loss_scale 1024.0000 (760.0497) mem 22339MB +[2024-07-25 04:58:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][170/625] eta 0:04:29 lr 0.000697 wd 0.0500 time 0.5721 (0.5918) data time 0.0007 (0.0029) model time 0.5713 (0.5955) loss 8.1859 (7.4719) grad_norm 2.5187 (2.3644) loss_scale 1024.0000 (775.4854) mem 22339MB +[2024-07-25 04:58:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][180/625] eta 0:04:23 lr 0.000697 wd 0.0500 time 0.5745 (0.5910) data time 0.0006 (0.0027) model time 0.5738 (0.5940) loss 8.2476 (7.4836) grad_norm 2.4788 (2.3574) loss_scale 1024.0000 (789.2155) mem 22339MB +[2024-07-25 04:58:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][190/625] eta 0:04:16 lr 0.000697 wd 0.0500 time 0.5719 (0.5902) data time 0.0008 (0.0026) model time 0.5712 (0.5926) loss 7.9843 (7.4901) grad_norm 1.6862 (2.3414) loss_scale 1024.0000 (801.5079) mem 22339MB +[2024-07-25 04:58:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][200/625] eta 0:04:10 lr 0.000697 wd 0.0500 time 0.5705 (0.5895) data time 0.0006 (0.0026) model time 0.5699 (0.5915) loss 8.5334 (7.4954) grad_norm 2.5084 (2.3289) loss_scale 1024.0000 (812.5771) mem 22339MB +[2024-07-25 04:58:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][210/625] eta 0:04:04 lr 0.000697 wd 0.0500 time 0.5707 (0.5888) data time 0.0007 (0.0025) model time 0.5699 (0.5904) loss 9.3257 (7.5140) grad_norm 1.5688 (2.3172) loss_scale 1024.0000 (822.5972) mem 22339MB +[2024-07-25 04:59:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][220/625] eta 0:03:58 lr 0.000697 wd 0.0500 time 0.5709 (0.5883) data time 0.0006 (0.0024) model time 0.5702 (0.5895) loss 8.4953 (7.5146) grad_norm 2.3197 (2.3372) loss_scale 1024.0000 (831.7104) mem 22339MB +[2024-07-25 04:59:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][230/625] eta 0:03:52 lr 0.000696 wd 0.0500 time 0.5693 (0.5877) data time 0.0007 (0.0023) model time 0.5685 (0.5887) loss 7.6796 (7.5337) grad_norm 5.3918 (2.3889) loss_scale 1024.0000 (840.0346) mem 22339MB +[2024-07-25 04:59:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][240/625] eta 0:03:46 lr 0.000696 wd 0.0500 time 0.5743 (0.5871) data time 0.0007 (0.0023) model time 0.5737 (0.5879) loss 8.3706 (7.5271) grad_norm 1.7828 (2.4193) loss_scale 1024.0000 (847.6680) mem 22339MB +[2024-07-25 04:59:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][250/625] eta 0:03:40 lr 0.000696 wd 0.0500 time 0.5748 (0.5868) data time 0.0008 (0.0022) model time 0.5739 (0.5874) loss 8.4025 (7.5448) grad_norm 1.8861 (2.4253) loss_scale 1024.0000 (854.6932) mem 22339MB +[2024-07-25 04:59:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][260/625] eta 0:03:34 lr 0.000696 wd 0.0500 time 0.5800 (0.5865) data time 0.0008 (0.0021) model time 0.5792 (0.5870) loss 7.0676 (7.5496) grad_norm 3.0392 (2.4517) loss_scale 1024.0000 (861.1801) mem 22339MB +[2024-07-25 04:59:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][270/625] eta 0:03:28 lr 0.000696 wd 0.0500 time 0.5718 (0.5860) data time 0.0008 (0.0021) model time 0.5710 (0.5863) loss 9.1723 (7.5484) grad_norm 2.3007 (2.4827) loss_scale 1024.0000 (867.1882) mem 22339MB +[2024-07-25 04:59:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][280/625] eta 0:03:22 lr 0.000696 wd 0.0500 time 0.5705 (0.5856) data time 0.0006 (0.0021) model time 0.5699 (0.5857) loss 6.5445 (7.5350) grad_norm 1.8052 (2.4874) loss_scale 1024.0000 (872.7687) mem 22339MB +[2024-07-25 04:59:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][290/625] eta 0:03:16 lr 0.000696 wd 0.0500 time 0.5760 (0.5853) data time 0.0007 (0.0021) model time 0.5753 (0.5853) loss 7.6981 (7.5245) grad_norm 2.5913 (2.4793) loss_scale 1024.0000 (877.9656) mem 22339MB +[2024-07-25 04:59:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][300/625] eta 0:03:10 lr 0.000696 wd 0.0500 time 0.5748 (0.5854) data time 0.0006 (0.0020) model time 0.5742 (0.5854) loss 7.8439 (7.5193) grad_norm 3.1145 (2.4672) loss_scale 1024.0000 (882.8173) mem 22339MB +[2024-07-25 04:59:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][310/625] eta 0:03:04 lr 0.000696 wd 0.0500 time 0.5701 (0.5860) data time 0.0007 (0.0020) model time 0.5694 (0.5860) loss 7.4220 (7.5262) grad_norm 2.6833 (2.4657) loss_scale 1024.0000 (887.3569) mem 22339MB +[2024-07-25 04:59:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][320/625] eta 0:02:59 lr 0.000696 wd 0.0500 time 0.7707 (0.5876) data time 0.0008 (0.0020) model time 0.7699 (0.5879) loss 6.3107 (7.5082) grad_norm 2.0873 (2.4457) loss_scale 1024.0000 (891.6137) mem 22339MB +[2024-07-25 05:00:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][330/625] eta 0:02:53 lr 0.000695 wd 0.0500 time 0.7105 (0.5890) data time 0.0009 (0.0020) model time 0.7097 (0.5895) loss 8.3659 (7.5106) grad_norm 2.3741 (2.4377) loss_scale 1024.0000 (895.6133) mem 22339MB +[2024-07-25 05:00:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][340/625] eta 0:02:48 lr 0.000695 wd 0.0500 time 0.5699 (0.5902) data time 0.0008 (0.0019) model time 0.5690 (0.5908) loss 7.2094 (7.5089) grad_norm 3.3990 (2.4325) loss_scale 1024.0000 (899.3783) mem 22339MB +[2024-07-25 05:00:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][350/625] eta 0:02:42 lr 0.000695 wd 0.0500 time 0.5721 (0.5902) data time 0.0006 (0.0019) model time 0.5715 (0.5908) loss 6.6023 (7.4998) grad_norm 2.0098 (2.4306) loss_scale 1024.0000 (902.9288) mem 22339MB +[2024-07-25 05:00:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][360/625] eta 0:02:36 lr 0.000695 wd 0.0500 time 0.5673 (0.5897) data time 0.0009 (0.0019) model time 0.5664 (0.5902) loss 6.1418 (7.4952) grad_norm 1.9880 (2.4297) loss_scale 1024.0000 (906.2825) mem 22339MB +[2024-07-25 05:00:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][370/625] eta 0:02:30 lr 0.000695 wd 0.0500 time 0.5720 (0.5893) data time 0.0006 (0.0018) model time 0.5714 (0.5897) loss 6.4202 (7.4955) grad_norm 1.5793 (2.4165) loss_scale 1024.0000 (909.4555) mem 22339MB +[2024-07-25 05:00:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][380/625] eta 0:02:24 lr 0.000695 wd 0.0500 time 0.5711 (0.5890) data time 0.0008 (0.0018) model time 0.5704 (0.5892) loss 7.1921 (7.4946) grad_norm 1.9937 (2.4082) loss_scale 1024.0000 (912.4619) mem 22339MB +[2024-07-25 05:00:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][390/625] eta 0:02:18 lr 0.000695 wd 0.0500 time 0.5712 (0.5886) data time 0.0006 (0.0018) model time 0.5706 (0.5888) loss 7.4105 (7.4943) grad_norm 2.4873 (2.4090) loss_scale 1024.0000 (915.3146) mem 22339MB +[2024-07-25 05:00:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][400/625] eta 0:02:12 lr 0.000695 wd 0.0500 time 0.5732 (0.5883) data time 0.0008 (0.0018) model time 0.5724 (0.5884) loss 8.4700 (7.4844) grad_norm 2.2456 (2.4221) loss_scale 1024.0000 (918.0249) mem 22339MB +[2024-07-25 05:00:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][410/625] eta 0:02:06 lr 0.000695 wd 0.0500 time 0.5684 (0.5879) data time 0.0008 (0.0017) model time 0.5676 (0.5880) loss 6.7960 (7.4725) grad_norm 2.1322 (2.4196) loss_scale 1024.0000 (920.6034) mem 22339MB +[2024-07-25 05:00:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][420/625] eta 0:02:00 lr 0.000694 wd 0.0500 time 0.5727 (0.5876) data time 0.0007 (0.0017) model time 0.5719 (0.5876) loss 7.2880 (7.4721) grad_norm 1.8763 (2.4162) loss_scale 1024.0000 (923.0594) mem 22339MB +[2024-07-25 05:01:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][430/625] eta 0:01:54 lr 0.000694 wd 0.0500 time 0.5719 (0.5873) data time 0.0008 (0.0017) model time 0.5711 (0.5872) loss 8.2624 (7.4832) grad_norm 2.9945 (2.4090) loss_scale 1024.0000 (925.4014) mem 22339MB +[2024-07-25 05:01:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][440/625] eta 0:01:48 lr 0.000694 wd 0.0500 time 0.5738 (0.5870) data time 0.0008 (0.0017) model time 0.5730 (0.5869) loss 7.4407 (7.4876) grad_norm 1.8220 (2.3999) loss_scale 1024.0000 (927.6372) mem 22339MB +[2024-07-25 05:01:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][450/625] eta 0:01:42 lr 0.000694 wd 0.0500 time 0.5749 (0.5868) data time 0.0006 (0.0017) model time 0.5744 (0.5866) loss 8.6228 (7.4841) grad_norm 2.3115 (2.3983) loss_scale 1024.0000 (929.7738) mem 22339MB +[2024-07-25 05:01:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][460/625] eta 0:01:36 lr 0.000694 wd 0.0500 time 0.5702 (0.5865) data time 0.0008 (0.0016) model time 0.5694 (0.5863) loss 7.4190 (7.4812) grad_norm 1.9303 (2.3925) loss_scale 1024.0000 (931.8178) mem 22339MB +[2024-07-25 05:01:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][470/625] eta 0:01:30 lr 0.000694 wd 0.0500 time 0.5667 (0.5863) data time 0.0006 (0.0016) model time 0.5660 (0.5860) loss 8.9562 (7.4835) grad_norm 1.6566 (2.3856) loss_scale 1024.0000 (933.7749) mem 22339MB +[2024-07-25 05:01:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][480/625] eta 0:01:24 lr 0.000694 wd 0.0500 time 0.5649 (0.5861) data time 0.0008 (0.0016) model time 0.5641 (0.5858) loss 8.2794 (7.4878) grad_norm 3.3218 (2.4040) loss_scale 1024.0000 (935.6507) mem 22339MB +[2024-07-25 05:01:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][490/625] eta 0:01:19 lr 0.000694 wd 0.0500 time 0.5652 (0.5859) data time 0.0007 (0.0016) model time 0.5645 (0.5855) loss 6.0509 (7.4878) grad_norm 2.3401 (2.4161) loss_scale 1024.0000 (937.4501) mem 22339MB +[2024-07-25 05:01:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][500/625] eta 0:01:13 lr 0.000694 wd 0.0500 time 0.5746 (0.5857) data time 0.0006 (0.0016) model time 0.5741 (0.5853) loss 7.5444 (7.4944) grad_norm 2.8286 (2.4131) loss_scale 1024.0000 (939.1776) mem 22339MB +[2024-07-25 05:01:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][510/625] eta 0:01:07 lr 0.000694 wd 0.0500 time 0.5725 (0.5855) data time 0.0006 (0.0016) model time 0.5719 (0.5851) loss 7.8612 (7.4912) grad_norm 2.1970 (2.4148) loss_scale 1024.0000 (940.8376) mem 22339MB +[2024-07-25 05:01:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][520/625] eta 0:01:01 lr 0.000693 wd 0.0500 time 0.7118 (0.5856) data time 0.0008 (0.0015) model time 0.7109 (0.5852) loss 5.9202 (7.4892) grad_norm 1.8770 (2.4219) loss_scale 1024.0000 (942.4338) mem 22339MB +[2024-07-25 05:02:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][530/625] eta 0:00:55 lr 0.000693 wd 0.0500 time 0.5719 (0.5856) data time 0.0006 (0.0015) model time 0.5713 (0.5852) loss 6.4495 (7.4869) grad_norm 2.3962 (2.4303) loss_scale 1024.0000 (943.9699) mem 22339MB +[2024-07-25 05:02:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][540/625] eta 0:00:49 lr 0.000693 wd 0.0500 time 0.7050 (0.5866) data time 0.0008 (0.0015) model time 0.7042 (0.5863) loss 7.0796 (7.4782) grad_norm 1.8842 (2.4334) loss_scale 1024.0000 (945.4492) mem 22339MB +[2024-07-25 05:02:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][550/625] eta 0:00:44 lr 0.000693 wd 0.0500 time 0.7515 (0.5881) data time 0.0008 (0.0015) model time 0.7507 (0.5879) loss 7.5791 (7.4839) grad_norm 2.7507 (2.4361) loss_scale 1024.0000 (946.8748) mem 22339MB +[2024-07-25 05:02:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][560/625] eta 0:00:38 lr 0.000693 wd 0.0500 time 0.7183 (0.5894) data time 0.0006 (0.0015) model time 0.7177 (0.5894) loss 6.8794 (7.4856) grad_norm 3.6324 (2.4405) loss_scale 1024.0000 (948.2496) mem 22339MB +[2024-07-25 05:02:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][570/625] eta 0:00:32 lr 0.000693 wd 0.0500 time 0.5770 (0.5893) data time 0.0006 (0.0015) model time 0.5765 (0.5892) loss 8.3627 (7.4891) grad_norm 2.3697 (2.4346) loss_scale 1024.0000 (949.5762) mem 22339MB +[2024-07-25 05:02:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][580/625] eta 0:00:26 lr 0.000693 wd 0.0500 time 0.5705 (0.5891) data time 0.0006 (0.0015) model time 0.5699 (0.5889) loss 5.7907 (7.4829) grad_norm 1.8014 (2.4304) loss_scale 1024.0000 (950.8571) mem 22339MB +[2024-07-25 05:02:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][590/625] eta 0:00:20 lr 0.000693 wd 0.0500 time 0.5747 (0.5888) data time 0.0008 (0.0014) model time 0.5739 (0.5887) loss 8.1113 (7.4850) grad_norm 1.6316 (2.4258) loss_scale 1024.0000 (952.0948) mem 22339MB +[2024-07-25 05:02:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][600/625] eta 0:00:14 lr 0.000693 wd 0.0500 time 0.5616 (0.5886) data time 0.0006 (0.0014) model time 0.5610 (0.5885) loss 7.8395 (7.4877) grad_norm 1.7407 (2.4205) loss_scale 1024.0000 (953.2912) mem 22339MB +[2024-07-25 05:02:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][610/625] eta 0:00:08 lr 0.000692 wd 0.0500 time 0.5715 (0.5884) data time 0.0004 (0.0014) model time 0.5711 (0.5882) loss 7.0203 (7.4958) grad_norm 2.1077 (2.4144) loss_scale 1024.0000 (954.4484) mem 22339MB +[2024-07-25 05:02:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [146/300][620/625] eta 0:00:02 lr 0.000692 wd 0.0500 time 0.5753 (0.5882) data time 0.0004 (0.0014) model time 0.5749 (0.5880) loss 8.1512 (7.4994) grad_norm 2.4988 (2.4117) loss_scale 1024.0000 (955.5684) mem 22339MB +[2024-07-25 05:02:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 146 training takes 0:06:07 +[2024-07-25 05:02:57 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 05:02:59 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 05:02:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.482 (0.482) Loss 0.5137 (0.5137) Acc@1 89.600 (89.600) Acc@5 98.438 (98.438) Mem 22339MB +[2024-07-25 05:03:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.8169 (0.6418) Acc@1 80.664 (85.760) Acc@5 96.094 (97.638) Mem 22339MB +[2024-07-25 05:03:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.128 (0.143) Loss 0.9165 (0.7562) Acc@1 77.686 (82.752) Acc@5 95.020 (96.482) Mem 22339MB +[2024-07-25 05:03:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.446 Acc@5 96.463 +[2024-07-25 05:03:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.4% +[2024-07-25 05:03:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.891 (0.891) Loss 0.4927 (0.4927) Acc@1 89.795 (89.795) Acc@5 98.779 (98.779) Mem 22339MB +[2024-07-25 05:03:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.196) Loss 0.7759 (0.6246) Acc@1 81.445 (86.617) Acc@5 96.240 (97.732) Mem 22339MB +[2024-07-25 05:03:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.162) Loss 0.9048 (0.7293) Acc@1 77.783 (83.440) Acc@5 95.361 (96.687) Mem 22339MB +[2024-07-25 05:03:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.087 Acc@5 96.699 +[2024-07-25 05:03:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.1% +[2024-07-25 05:03:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.09% +[2024-07-25 05:03:06 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 05:03:08 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 05:03:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][0/625] eta 0:09:44 lr 0.000692 wd 0.0500 time 0.9353 (0.9353) data time 0.4185 (0.4185) model time 0.0000 (0.0000) loss 8.1950 (8.1950) grad_norm 2.2960 (2.2960) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:03:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][10/625] eta 0:06:13 lr 0.000692 wd 0.0500 time 0.5735 (0.6073) data time 0.0008 (0.0388) model time 0.0000 (0.0000) loss 8.4213 (7.5137) grad_norm 2.1014 (2.1473) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:03:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][20/625] eta 0:05:58 lr 0.000692 wd 0.0500 time 0.5713 (0.5917) data time 0.0006 (0.0207) model time 0.0000 (0.0000) loss 7.4119 (7.1913) grad_norm 2.0979 (2.0785) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:03:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][30/625] eta 0:05:48 lr 0.000692 wd 0.0500 time 0.5725 (0.5864) data time 0.0008 (0.0143) model time 0.0000 (0.0000) loss 8.4720 (7.3165) grad_norm 1.7343 (2.0737) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:03:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][40/625] eta 0:05:41 lr 0.000692 wd 0.0500 time 0.5791 (0.5840) data time 0.0006 (0.0110) model time 0.0000 (0.0000) loss 6.1895 (7.3024) grad_norm 1.6197 (2.0464) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:03:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][50/625] eta 0:05:34 lr 0.000692 wd 0.0500 time 0.5731 (0.5820) data time 0.0006 (0.0090) model time 0.0000 (0.0000) loss 6.9601 (7.3178) grad_norm 2.3633 (2.1379) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:03:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][60/625] eta 0:05:28 lr 0.000692 wd 0.0500 time 0.5716 (0.5809) data time 0.0008 (0.0077) model time 0.5708 (0.5743) loss 8.4863 (7.3538) grad_norm 2.6684 (2.2723) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:03:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][70/625] eta 0:05:22 lr 0.000692 wd 0.0500 time 0.5708 (0.5804) data time 0.0008 (0.0067) model time 0.5701 (0.5754) loss 6.7937 (7.3264) grad_norm 1.8564 (2.3016) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:03:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][80/625] eta 0:05:16 lr 0.000691 wd 0.0500 time 0.5745 (0.5799) data time 0.0006 (0.0060) model time 0.5738 (0.5755) loss 6.0135 (7.3411) grad_norm 4.0757 (2.2886) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:04:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][90/625] eta 0:05:09 lr 0.000691 wd 0.0500 time 0.5709 (0.5793) data time 0.0006 (0.0054) model time 0.5702 (0.5751) loss 6.6451 (7.3308) grad_norm 2.1599 (2.2942) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:04:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][100/625] eta 0:05:04 lr 0.000691 wd 0.0500 time 0.5706 (0.5806) data time 0.0008 (0.0049) model time 0.5699 (0.5783) loss 8.7541 (7.3422) grad_norm 2.7699 (2.2893) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:04:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][110/625] eta 0:04:58 lr 0.000691 wd 0.0500 time 0.5740 (0.5802) data time 0.0006 (0.0046) model time 0.5735 (0.5779) loss 6.4916 (7.3542) grad_norm 3.0479 (2.3175) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:04:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][120/625] eta 0:04:53 lr 0.000691 wd 0.0500 time 0.7171 (0.5813) data time 0.0007 (0.0043) model time 0.7164 (0.5800) loss 8.2994 (7.3651) grad_norm 3.3020 (2.3327) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:04:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][130/625] eta 0:04:48 lr 0.000691 wd 0.0500 time 0.5703 (0.5825) data time 0.0006 (0.0040) model time 0.5697 (0.5820) loss 7.2346 (7.4056) grad_norm 1.8966 (2.3332) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:04:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][140/625] eta 0:04:44 lr 0.000691 wd 0.0500 time 0.5711 (0.5871) data time 0.0007 (0.0038) model time 0.5704 (0.5891) loss 7.5352 (7.3630) grad_norm 2.5828 (2.3210) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:04:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][150/625] eta 0:04:41 lr 0.000691 wd 0.0500 time 0.7457 (0.5917) data time 0.0006 (0.0036) model time 0.7451 (0.5958) loss 7.3661 (7.3632) grad_norm 2.2930 (2.3177) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:04:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][160/625] eta 0:04:36 lr 0.000691 wd 0.0500 time 0.5701 (0.5945) data time 0.0006 (0.0034) model time 0.5695 (0.5994) loss 7.9277 (7.3645) grad_norm 1.5261 (2.3239) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:04:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][170/625] eta 0:04:29 lr 0.000691 wd 0.0500 time 0.5697 (0.5932) data time 0.0007 (0.0032) model time 0.5689 (0.5971) loss 8.6743 (7.3692) grad_norm 1.7640 (2.3204) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:04:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][180/625] eta 0:04:23 lr 0.000690 wd 0.0500 time 0.5721 (0.5922) data time 0.0006 (0.0031) model time 0.5715 (0.5953) loss 6.5648 (7.3793) grad_norm 2.9729 (2.3111) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:05:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][190/625] eta 0:04:17 lr 0.000690 wd 0.0500 time 0.5624 (0.5913) data time 0.0006 (0.0030) model time 0.5617 (0.5938) loss 7.7702 (7.3709) grad_norm 2.7241 (2.3235) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:05:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][200/625] eta 0:04:10 lr 0.000690 wd 0.0500 time 0.5717 (0.5905) data time 0.0008 (0.0029) model time 0.5709 (0.5925) loss 8.4721 (7.3615) grad_norm 1.7754 (2.3061) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:05:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][210/625] eta 0:04:04 lr 0.000690 wd 0.0500 time 0.5714 (0.5898) data time 0.0008 (0.0028) model time 0.5707 (0.5915) loss 8.2743 (7.3613) grad_norm 3.1263 (2.3030) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:05:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][220/625] eta 0:03:58 lr 0.000690 wd 0.0500 time 0.5749 (0.5892) data time 0.0008 (0.0027) model time 0.5741 (0.5906) loss 8.5289 (7.3906) grad_norm 2.2428 (2.2997) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:05:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][230/625] eta 0:03:52 lr 0.000690 wd 0.0500 time 0.5631 (0.5886) data time 0.0008 (0.0026) model time 0.5623 (0.5897) loss 7.7861 (7.3757) grad_norm 2.3679 (2.3104) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:05:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][240/625] eta 0:03:46 lr 0.000690 wd 0.0500 time 0.5629 (0.5881) data time 0.0008 (0.0025) model time 0.5621 (0.5889) loss 5.8635 (7.3749) grad_norm 2.2142 (2.3059) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:05:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][250/625] eta 0:03:40 lr 0.000690 wd 0.0500 time 0.5607 (0.5879) data time 0.0006 (0.0025) model time 0.5601 (0.5886) loss 8.0125 (7.3786) grad_norm 3.2684 (2.2970) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:05:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][260/625] eta 0:03:34 lr 0.000690 wd 0.0500 time 0.5732 (0.5874) data time 0.0008 (0.0024) model time 0.5724 (0.5879) loss 7.6958 (7.3652) grad_norm 2.3861 (2.3013) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:05:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][270/625] eta 0:03:28 lr 0.000689 wd 0.0500 time 0.5744 (0.5870) data time 0.0008 (0.0024) model time 0.5736 (0.5873) loss 7.2607 (7.3699) grad_norm 1.7986 (2.2948) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:05:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][280/625] eta 0:03:22 lr 0.000689 wd 0.0500 time 0.5737 (0.5867) data time 0.0008 (0.0023) model time 0.5730 (0.5869) loss 6.2379 (7.3737) grad_norm 1.7433 (2.2793) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:05:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][290/625] eta 0:03:16 lr 0.000689 wd 0.0500 time 0.5710 (0.5863) data time 0.0006 (0.0022) model time 0.5704 (0.5864) loss 6.2788 (7.3770) grad_norm 3.1934 (2.2810) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:06:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][300/625] eta 0:03:10 lr 0.000689 wd 0.0500 time 0.5737 (0.5860) data time 0.0008 (0.0022) model time 0.5729 (0.5860) loss 8.1668 (7.3830) grad_norm 1.6620 (2.2959) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:06:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][310/625] eta 0:03:04 lr 0.000689 wd 0.0500 time 0.5724 (0.5857) data time 0.0007 (0.0021) model time 0.5717 (0.5856) loss 8.8474 (7.3720) grad_norm 1.5408 (2.2963) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:06:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][320/625] eta 0:02:58 lr 0.000689 wd 0.0500 time 0.5732 (0.5858) data time 0.0007 (0.0021) model time 0.5725 (0.5857) loss 7.1109 (7.3777) grad_norm 1.5395 (2.2886) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:06:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][330/625] eta 0:02:52 lr 0.000689 wd 0.0500 time 0.5725 (0.5855) data time 0.0008 (0.0021) model time 0.5716 (0.5853) loss 6.3636 (7.3927) grad_norm 1.6830 (2.3011) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:06:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][340/625] eta 0:02:46 lr 0.000689 wd 0.0500 time 0.5791 (0.5854) data time 0.0006 (0.0020) model time 0.5785 (0.5851) loss 6.2447 (7.4059) grad_norm 2.5894 (2.3116) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:06:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][350/625] eta 0:02:41 lr 0.000689 wd 0.0500 time 0.7764 (0.5868) data time 0.0006 (0.0020) model time 0.7758 (0.5868) loss 6.5099 (7.3957) grad_norm 2.6673 (2.3234) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:06:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][360/625] eta 0:02:35 lr 0.000689 wd 0.0500 time 0.6764 (0.5879) data time 0.0006 (0.0020) model time 0.6757 (0.5880) loss 7.8781 (7.3883) grad_norm 2.4339 (2.3204) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:06:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][370/625] eta 0:02:30 lr 0.000688 wd 0.0500 time 0.7411 (0.5893) data time 0.0006 (0.0020) model time 0.7405 (0.5896) loss 6.5063 (7.3855) grad_norm 2.1198 (2.3153) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:06:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][380/625] eta 0:02:24 lr 0.000688 wd 0.0500 time 0.5719 (0.5903) data time 0.0008 (0.0019) model time 0.5711 (0.5907) loss 6.2379 (7.3821) grad_norm 1.9248 (2.3028) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:06:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][390/625] eta 0:02:18 lr 0.000688 wd 0.0500 time 0.5763 (0.5899) data time 0.0008 (0.0019) model time 0.5755 (0.5902) loss 8.8352 (7.3879) grad_norm 1.7974 (2.2976) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:07:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][400/625] eta 0:02:12 lr 0.000688 wd 0.0500 time 0.5709 (0.5895) data time 0.0007 (0.0019) model time 0.5702 (0.5898) loss 7.9335 (7.3894) grad_norm 2.4278 (2.2946) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:07:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][410/625] eta 0:02:06 lr 0.000688 wd 0.0500 time 0.5627 (0.5893) data time 0.0007 (0.0018) model time 0.5620 (0.5895) loss 7.2886 (7.3989) grad_norm 2.4596 (2.3019) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:07:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][420/625] eta 0:02:00 lr 0.000688 wd 0.0500 time 0.5731 (0.5889) data time 0.0008 (0.0018) model time 0.5723 (0.5891) loss 6.9223 (7.3959) grad_norm 2.1871 (2.3015) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:07:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][430/625] eta 0:01:54 lr 0.000688 wd 0.0500 time 0.5611 (0.5886) data time 0.0008 (0.0018) model time 0.5603 (0.5887) loss 7.2503 (7.4012) grad_norm 2.0549 (2.3119) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:07:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][440/625] eta 0:01:48 lr 0.000688 wd 0.0500 time 0.5645 (0.5885) data time 0.0009 (0.0018) model time 0.5636 (0.5885) loss 6.5049 (7.4108) grad_norm 1.9547 (2.3082) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:07:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][450/625] eta 0:01:42 lr 0.000688 wd 0.0500 time 0.5733 (0.5882) data time 0.0006 (0.0018) model time 0.5727 (0.5881) loss 7.1095 (7.4097) grad_norm 3.1324 (2.3168) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:07:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][460/625] eta 0:01:37 lr 0.000687 wd 0.0500 time 0.5751 (0.5879) data time 0.0008 (0.0017) model time 0.5744 (0.5878) loss 7.6533 (7.4061) grad_norm 2.0281 (2.3167) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:07:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][470/625] eta 0:01:31 lr 0.000687 wd 0.0500 time 0.5751 (0.5877) data time 0.0006 (0.0017) model time 0.5745 (0.5875) loss 8.3979 (7.4164) grad_norm 2.5270 (2.3101) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:07:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][480/625] eta 0:01:25 lr 0.000687 wd 0.0500 time 0.5749 (0.5874) data time 0.0006 (0.0017) model time 0.5743 (0.5872) loss 8.0904 (7.4051) grad_norm 1.7775 (2.3070) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:07:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][490/625] eta 0:01:19 lr 0.000687 wd 0.0500 time 0.5695 (0.5872) data time 0.0006 (0.0017) model time 0.5688 (0.5869) loss 6.0672 (7.4033) grad_norm 3.0165 (2.3032) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:08:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][500/625] eta 0:01:13 lr 0.000687 wd 0.0500 time 0.5712 (0.5869) data time 0.0008 (0.0017) model time 0.5704 (0.5866) loss 8.2254 (7.4050) grad_norm 1.9363 (2.2988) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:08:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][510/625] eta 0:01:07 lr 0.000687 wd 0.0500 time 0.5717 (0.5867) data time 0.0009 (0.0017) model time 0.5709 (0.5864) loss 8.2231 (7.4012) grad_norm 2.4167 (2.3077) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:08:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][520/625] eta 0:01:01 lr 0.000687 wd 0.0500 time 0.5747 (0.5865) data time 0.0006 (0.0017) model time 0.5741 (0.5862) loss 7.2091 (7.3990) grad_norm 1.9474 (2.3076) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:08:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][530/625] eta 0:00:55 lr 0.000687 wd 0.0500 time 0.5685 (0.5864) data time 0.0006 (0.0016) model time 0.5679 (0.5860) loss 8.1054 (7.4027) grad_norm 4.8675 (2.3159) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:08:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][540/625] eta 0:00:49 lr 0.000687 wd 0.0500 time 0.5738 (0.5864) data time 0.0006 (0.0016) model time 0.5732 (0.5860) loss 6.5927 (7.4100) grad_norm 2.4647 (2.3184) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:08:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][550/625] eta 0:00:43 lr 0.000687 wd 0.0500 time 0.5716 (0.5862) data time 0.0006 (0.0016) model time 0.5710 (0.5857) loss 8.2195 (7.4084) grad_norm 1.7702 (2.3107) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:08:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][560/625] eta 0:00:38 lr 0.000686 wd 0.0500 time 0.5841 (0.5862) data time 0.0008 (0.0016) model time 0.5833 (0.5858) loss 7.6490 (7.4061) grad_norm 1.9998 (2.3082) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:08:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][570/625] eta 0:00:32 lr 0.000686 wd 0.0500 time 0.5695 (0.5866) data time 0.0008 (0.0016) model time 0.5687 (0.5862) loss 8.7023 (7.4120) grad_norm 2.2809 (2.3061) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:08:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][580/625] eta 0:00:26 lr 0.000686 wd 0.0500 time 0.7288 (0.5877) data time 0.0006 (0.0016) model time 0.7282 (0.5874) loss 8.0284 (7.4113) grad_norm 2.0889 (2.3031) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:08:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][590/625] eta 0:00:20 lr 0.000686 wd 0.0500 time 0.7283 (0.5883) data time 0.0008 (0.0016) model time 0.7275 (0.5880) loss 6.8403 (7.4188) grad_norm 2.5584 (2.3028) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:09:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][600/625] eta 0:00:14 lr 0.000686 wd 0.0500 time 0.5739 (0.5889) data time 0.0009 (0.0015) model time 0.5729 (0.5887) loss 6.6124 (7.4186) grad_norm 2.0019 (2.2992) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:09:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][610/625] eta 0:00:08 lr 0.000686 wd 0.0500 time 0.5721 (0.5887) data time 0.0006 (0.0015) model time 0.5715 (0.5885) loss 8.1506 (7.4175) grad_norm 3.0142 (2.2973) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:09:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [147/300][620/625] eta 0:00:02 lr 0.000686 wd 0.0500 time 0.5634 (0.5885) data time 0.0005 (0.0015) model time 0.5629 (0.5882) loss 7.2241 (7.4161) grad_norm 1.9719 (2.3037) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:09:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 147 training takes 0:06:07 +[2024-07-25 05:09:16 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 05:09:17 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 05:09:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.483 (0.483) Loss 0.5190 (0.5190) Acc@1 89.746 (89.746) Acc@5 98.633 (98.633) Mem 22339MB +[2024-07-25 05:09:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.8174 (0.6543) Acc@1 81.348 (86.133) Acc@5 96.191 (97.603) Mem 22339MB +[2024-07-25 05:09:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.143) Loss 0.9097 (0.7608) Acc@1 77.539 (82.989) Acc@5 95.361 (96.491) Mem 22339MB +[2024-07-25 05:09:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.618 Acc@5 96.441 +[2024-07-25 05:09:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.6% +[2024-07-25 05:09:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 82.62% +[2024-07-25 05:09:21 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 05:09:22 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 05:09:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.474 (0.474) Loss 0.4932 (0.4932) Acc@1 89.746 (89.746) Acc@5 98.779 (98.779) Mem 22339MB +[2024-07-25 05:09:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7754 (0.6246) Acc@1 81.543 (86.630) Acc@5 96.240 (97.723) Mem 22339MB +[2024-07-25 05:09:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9043 (0.7289) Acc@1 77.979 (83.471) Acc@5 95.361 (96.696) Mem 22339MB +[2024-07-25 05:09:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.121 Acc@5 96.709 +[2024-07-25 05:09:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.1% +[2024-07-25 05:09:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.12% +[2024-07-25 05:09:26 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 05:09:27 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 05:09:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][0/625] eta 0:09:11 lr 0.000686 wd 0.0500 time 0.8826 (0.8826) data time 0.3665 (0.3665) model time 0.0000 (0.0000) loss 7.2681 (7.2681) grad_norm 2.7101 (2.7101) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:09:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][10/625] eta 0:06:10 lr 0.000686 wd 0.0500 time 0.5723 (0.6029) data time 0.0006 (0.0340) model time 0.0000 (0.0000) loss 7.8458 (7.6110) grad_norm 1.8315 (2.2558) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:09:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][20/625] eta 0:05:57 lr 0.000686 wd 0.0500 time 0.5728 (0.5913) data time 0.0006 (0.0185) model time 0.0000 (0.0000) loss 8.3190 (7.7862) grad_norm 2.4746 (2.4088) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:09:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][30/625] eta 0:05:48 lr 0.000685 wd 0.0500 time 0.5617 (0.5864) data time 0.0008 (0.0128) model time 0.0000 (0.0000) loss 9.0739 (7.7701) grad_norm 1.8103 (2.3270) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:09:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][40/625] eta 0:05:43 lr 0.000685 wd 0.0500 time 0.5717 (0.5875) data time 0.0008 (0.0099) model time 0.0000 (0.0000) loss 8.6676 (7.6990) grad_norm 1.7693 (2.3158) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:09:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][50/625] eta 0:05:36 lr 0.000685 wd 0.0500 time 0.5733 (0.5849) data time 0.0007 (0.0081) model time 0.0000 (0.0000) loss 8.1615 (7.6358) grad_norm 2.0320 (2.3131) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:10:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][60/625] eta 0:05:29 lr 0.000685 wd 0.0500 time 0.5728 (0.5831) data time 0.0006 (0.0069) model time 0.5722 (0.5736) loss 8.1653 (7.6077) grad_norm 2.3361 (2.2660) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:10:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][70/625] eta 0:05:22 lr 0.000685 wd 0.0500 time 0.5749 (0.5820) data time 0.0008 (0.0060) model time 0.5741 (0.5738) loss 6.3424 (7.5523) grad_norm 2.0216 (2.2687) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:10:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][80/625] eta 0:05:16 lr 0.000685 wd 0.0500 time 0.5746 (0.5811) data time 0.0008 (0.0054) model time 0.5738 (0.5738) loss 7.9443 (7.5408) grad_norm 3.3297 (2.3028) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:10:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][90/625] eta 0:05:10 lr 0.000685 wd 0.0500 time 0.5734 (0.5803) data time 0.0007 (0.0049) model time 0.5727 (0.5737) loss 5.7989 (7.4700) grad_norm 2.8462 (2.3285) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:10:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][100/625] eta 0:05:04 lr 0.000685 wd 0.0500 time 0.5723 (0.5798) data time 0.0009 (0.0045) model time 0.5715 (0.5738) loss 7.4222 (7.4786) grad_norm 1.6671 (2.3230) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:10:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][110/625] eta 0:04:58 lr 0.000685 wd 0.0500 time 0.5703 (0.5795) data time 0.0008 (0.0042) model time 0.5695 (0.5741) loss 6.0290 (7.5102) grad_norm 2.6359 (2.3717) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:10:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][120/625] eta 0:04:52 lr 0.000684 wd 0.0500 time 0.5709 (0.5790) data time 0.0008 (0.0039) model time 0.5701 (0.5740) loss 7.4181 (7.5067) grad_norm 1.4716 (2.3462) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:10:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][130/625] eta 0:04:46 lr 0.000684 wd 0.0500 time 0.5689 (0.5787) data time 0.0008 (0.0036) model time 0.5681 (0.5739) loss 6.9153 (7.5071) grad_norm 2.1178 (2.3327) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:10:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][140/625] eta 0:04:40 lr 0.000684 wd 0.0500 time 0.5777 (0.5784) data time 0.0008 (0.0034) model time 0.5769 (0.5739) loss 7.0785 (7.4575) grad_norm 1.6804 (2.3334) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:10:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][150/625] eta 0:04:34 lr 0.000684 wd 0.0500 time 0.5745 (0.5781) data time 0.0009 (0.0033) model time 0.5736 (0.5739) loss 7.8958 (7.4833) grad_norm 1.7745 (2.3440) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:11:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][160/625] eta 0:04:29 lr 0.000684 wd 0.0500 time 0.7140 (0.5802) data time 0.0006 (0.0031) model time 0.7134 (0.5773) loss 6.1749 (7.4894) grad_norm 2.6117 (2.3526) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:11:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][170/625] eta 0:04:25 lr 0.000684 wd 0.0500 time 0.5719 (0.5826) data time 0.0008 (0.0030) model time 0.5711 (0.5809) loss 8.8788 (7.4921) grad_norm 1.8125 (2.3514) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:11:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][180/625] eta 0:04:20 lr 0.000684 wd 0.0500 time 0.5712 (0.5852) data time 0.0007 (0.0029) model time 0.5705 (0.5846) loss 7.6148 (7.4780) grad_norm 3.0462 (2.3902) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:11:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][190/625] eta 0:04:16 lr 0.000684 wd 0.0500 time 0.5706 (0.5898) data time 0.0007 (0.0028) model time 0.5699 (0.5908) loss 6.0399 (7.5005) grad_norm 1.6921 (2.3800) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:11:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][200/625] eta 0:04:10 lr 0.000684 wd 0.0500 time 0.5715 (0.5904) data time 0.0008 (0.0027) model time 0.5707 (0.5915) loss 7.1546 (7.5007) grad_norm 2.2498 (2.3994) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:11:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][210/625] eta 0:04:04 lr 0.000684 wd 0.0500 time 0.5717 (0.5897) data time 0.0007 (0.0026) model time 0.5710 (0.5904) loss 6.3907 (7.4866) grad_norm 3.0139 (2.3926) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:11:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][220/625] eta 0:03:58 lr 0.000683 wd 0.0500 time 0.5683 (0.5890) data time 0.0006 (0.0025) model time 0.5677 (0.5895) loss 6.9090 (7.4593) grad_norm 2.2761 (2.3752) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:11:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][230/625] eta 0:03:52 lr 0.000683 wd 0.0500 time 0.5711 (0.5884) data time 0.0006 (0.0024) model time 0.5704 (0.5886) loss 6.7116 (7.4405) grad_norm 1.8038 (2.3883) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:11:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][240/625] eta 0:03:46 lr 0.000683 wd 0.0500 time 0.5713 (0.5879) data time 0.0008 (0.0024) model time 0.5706 (0.5878) loss 7.9711 (7.4487) grad_norm 2.0056 (2.3794) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:11:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][250/625] eta 0:03:40 lr 0.000683 wd 0.0500 time 0.5744 (0.5874) data time 0.0006 (0.0023) model time 0.5738 (0.5872) loss 7.7504 (7.4491) grad_norm 1.9515 (2.3586) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:12:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][260/625] eta 0:03:34 lr 0.000683 wd 0.0500 time 0.5721 (0.5872) data time 0.0006 (0.0022) model time 0.5715 (0.5870) loss 8.1889 (7.4607) grad_norm 2.1330 (2.3589) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:12:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][270/625] eta 0:03:28 lr 0.000683 wd 0.0500 time 0.5722 (0.5868) data time 0.0006 (0.0022) model time 0.5715 (0.5864) loss 6.7717 (7.4641) grad_norm 2.9794 (2.3541) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:12:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][280/625] eta 0:03:22 lr 0.000683 wd 0.0500 time 0.5746 (0.5864) data time 0.0006 (0.0021) model time 0.5740 (0.5859) loss 8.5453 (7.4629) grad_norm 1.9036 (2.3501) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:12:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][290/625] eta 0:03:16 lr 0.000683 wd 0.0500 time 0.5621 (0.5860) data time 0.0007 (0.0021) model time 0.5613 (0.5854) loss 5.7570 (7.4503) grad_norm 1.9096 (2.3374) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:12:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][300/625] eta 0:03:10 lr 0.000683 wd 0.0500 time 0.5697 (0.5856) data time 0.0008 (0.0021) model time 0.5690 (0.5850) loss 7.0183 (7.4583) grad_norm 2.2943 (2.3391) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:12:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][310/625] eta 0:03:04 lr 0.000682 wd 0.0500 time 0.5743 (0.5854) data time 0.0008 (0.0020) model time 0.5735 (0.5846) loss 5.9001 (7.4491) grad_norm 3.4547 (2.3373) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:12:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][320/625] eta 0:02:58 lr 0.000682 wd 0.0500 time 0.5720 (0.5850) data time 0.0008 (0.0020) model time 0.5712 (0.5842) loss 6.8588 (7.4475) grad_norm 2.0269 (2.3410) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:12:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][330/625] eta 0:02:52 lr 0.000682 wd 0.0500 time 0.5690 (0.5849) data time 0.0008 (0.0020) model time 0.5683 (0.5840) loss 7.7732 (7.4534) grad_norm 1.9528 (2.3384) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:12:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][340/625] eta 0:02:46 lr 0.000682 wd 0.0500 time 0.5715 (0.5846) data time 0.0009 (0.0020) model time 0.5707 (0.5836) loss 6.7684 (7.4631) grad_norm 1.7157 (2.3290) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:12:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][350/625] eta 0:02:40 lr 0.000682 wd 0.0500 time 0.5728 (0.5843) data time 0.0006 (0.0019) model time 0.5722 (0.5833) loss 7.8110 (7.4818) grad_norm 2.2874 (2.3190) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:12:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][360/625] eta 0:02:34 lr 0.000682 wd 0.0500 time 0.5731 (0.5840) data time 0.0006 (0.0019) model time 0.5725 (0.5830) loss 7.1820 (7.4778) grad_norm 2.9509 (2.3144) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:13:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][370/625] eta 0:02:28 lr 0.000682 wd 0.0500 time 0.5720 (0.5839) data time 0.0006 (0.0019) model time 0.5714 (0.5828) loss 6.5734 (7.4769) grad_norm 2.1331 (2.3151) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:13:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][380/625] eta 0:02:23 lr 0.000682 wd 0.0500 time 0.6085 (0.5842) data time 0.0006 (0.0018) model time 0.6079 (0.5832) loss 6.4686 (7.4701) grad_norm 1.9170 (2.3116) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:13:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][390/625] eta 0:02:17 lr 0.000682 wd 0.0500 time 0.7186 (0.5853) data time 0.0008 (0.0018) model time 0.7178 (0.5845) loss 5.8642 (7.4633) grad_norm 3.4709 (2.3136) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:13:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][400/625] eta 0:02:11 lr 0.000682 wd 0.0500 time 0.5673 (0.5863) data time 0.0008 (0.0018) model time 0.5665 (0.5856) loss 6.2901 (7.4588) grad_norm 2.1201 (2.3070) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:13:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][410/625] eta 0:02:06 lr 0.000681 wd 0.0500 time 0.5731 (0.5878) data time 0.0007 (0.0018) model time 0.5725 (0.5874) loss 8.0535 (7.4756) grad_norm 1.9609 (2.2998) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:13:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][420/625] eta 0:02:00 lr 0.000681 wd 0.0500 time 0.5730 (0.5880) data time 0.0008 (0.0018) model time 0.5722 (0.5876) loss 7.3531 (7.4746) grad_norm 1.9082 (2.2965) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:13:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][430/625] eta 0:01:54 lr 0.000681 wd 0.0500 time 0.5725 (0.5877) data time 0.0008 (0.0017) model time 0.5717 (0.5872) loss 7.6641 (7.4791) grad_norm 2.4240 (2.2921) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:13:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][440/625] eta 0:01:48 lr 0.000681 wd 0.0500 time 0.5770 (0.5874) data time 0.0008 (0.0017) model time 0.5762 (0.5868) loss 8.1321 (7.4726) grad_norm 2.2325 (2.2867) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:13:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][450/625] eta 0:01:42 lr 0.000681 wd 0.0500 time 0.5682 (0.5871) data time 0.0006 (0.0017) model time 0.5676 (0.5865) loss 7.7221 (7.4687) grad_norm 1.6938 (2.2852) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:13:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][460/625] eta 0:01:36 lr 0.000681 wd 0.0500 time 0.5752 (0.5869) data time 0.0006 (0.0017) model time 0.5746 (0.5862) loss 7.2448 (7.4703) grad_norm 1.8566 (2.2859) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:14:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][470/625] eta 0:01:30 lr 0.000681 wd 0.0500 time 0.5705 (0.5866) data time 0.0009 (0.0017) model time 0.5697 (0.5859) loss 8.2420 (7.4616) grad_norm 2.2140 (2.2842) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:14:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][480/625] eta 0:01:25 lr 0.000681 wd 0.0500 time 0.5744 (0.5865) data time 0.0008 (0.0016) model time 0.5736 (0.5858) loss 5.8108 (7.4539) grad_norm 1.8999 (2.2782) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:14:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][490/625] eta 0:01:19 lr 0.000681 wd 0.0500 time 0.5733 (0.5862) data time 0.0006 (0.0016) model time 0.5727 (0.5855) loss 6.2825 (7.4564) grad_norm 1.6176 (2.2728) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:14:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][500/625] eta 0:01:13 lr 0.000680 wd 0.0500 time 0.5711 (0.5860) data time 0.0006 (0.0016) model time 0.5705 (0.5853) loss 7.0536 (7.4592) grad_norm 1.7746 (2.2722) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:14:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][510/625] eta 0:01:07 lr 0.000680 wd 0.0500 time 0.5723 (0.5858) data time 0.0008 (0.0016) model time 0.5715 (0.5850) loss 8.8277 (7.4707) grad_norm 2.4160 (2.2861) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:14:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][520/625] eta 0:01:01 lr 0.000680 wd 0.0500 time 0.5708 (0.5856) data time 0.0006 (0.0016) model time 0.5701 (0.5848) loss 6.8218 (7.4718) grad_norm 2.8836 (2.2928) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:14:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][530/625] eta 0:00:55 lr 0.000680 wd 0.0500 time 0.5701 (0.5855) data time 0.0008 (0.0016) model time 0.5693 (0.5847) loss 8.6600 (7.4713) grad_norm 2.6338 (2.2991) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:14:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][540/625] eta 0:00:49 lr 0.000680 wd 0.0500 time 0.5738 (0.5853) data time 0.0008 (0.0016) model time 0.5730 (0.5845) loss 9.6740 (7.4728) grad_norm 2.2860 (2.3027) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:14:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][550/625] eta 0:00:43 lr 0.000680 wd 0.0500 time 0.5684 (0.5851) data time 0.0006 (0.0015) model time 0.5678 (0.5842) loss 6.7750 (7.4633) grad_norm 2.4618 (2.2995) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:14:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][560/625] eta 0:00:38 lr 0.000680 wd 0.0500 time 0.5757 (0.5849) data time 0.0008 (0.0015) model time 0.5749 (0.5840) loss 7.6562 (7.4604) grad_norm 1.6450 (2.3031) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:15:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][570/625] eta 0:00:32 lr 0.000680 wd 0.0500 time 0.5765 (0.5848) data time 0.0006 (0.0015) model time 0.5759 (0.5839) loss 8.6826 (7.4600) grad_norm 2.1459 (2.2998) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:15:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][580/625] eta 0:00:26 lr 0.000680 wd 0.0500 time 0.5739 (0.5846) data time 0.0006 (0.0015) model time 0.5732 (0.5837) loss 7.3036 (7.4602) grad_norm 1.7216 (2.2979) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:15:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][590/625] eta 0:00:20 lr 0.000679 wd 0.0500 time 0.5631 (0.5845) data time 0.0006 (0.0015) model time 0.5625 (0.5836) loss 6.8866 (7.4564) grad_norm 1.9911 (2.2953) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:15:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][600/625] eta 0:00:14 lr 0.000679 wd 0.0500 time 0.7345 (0.5847) data time 0.0008 (0.0015) model time 0.7337 (0.5839) loss 8.6341 (7.4610) grad_norm 2.4473 (2.2944) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:15:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][610/625] eta 0:00:08 lr 0.000679 wd 0.0500 time 0.7023 (0.5857) data time 0.0004 (0.0015) model time 0.7019 (0.5849) loss 8.5909 (7.4615) grad_norm 1.9990 (2.2963) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:15:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [148/300][620/625] eta 0:00:02 lr 0.000679 wd 0.0500 time 0.5718 (0.5862) data time 0.0004 (0.0015) model time 0.5714 (0.5854) loss 6.3095 (7.4574) grad_norm 2.7871 (2.2951) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:15:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 148 training takes 0:06:06 +[2024-07-25 05:15:33 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 05:15:35 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 05:15:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.679 (0.679) Loss 0.5029 (0.5029) Acc@1 89.209 (89.209) Acc@5 98.779 (98.779) Mem 22339MB +[2024-07-25 05:15:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.176) Loss 0.8218 (0.6426) Acc@1 81.396 (86.275) Acc@5 95.947 (97.678) Mem 22339MB +[2024-07-25 05:15:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.152) Loss 0.9414 (0.7527) Acc@1 77.002 (82.924) Acc@5 95.166 (96.545) Mem 22339MB +[2024-07-25 05:15:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.568 Acc@5 96.535 +[2024-07-25 05:15:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.6% +[2024-07-25 05:15:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.786 (0.786) Loss 0.4927 (0.4927) Acc@1 89.551 (89.551) Acc@5 98.828 (98.828) Mem 22339MB +[2024-07-25 05:15:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.186) Loss 0.7749 (0.6243) Acc@1 81.348 (86.594) Acc@5 96.289 (97.732) Mem 22339MB +[2024-07-25 05:15:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.157) Loss 0.9038 (0.7284) Acc@1 77.930 (83.471) Acc@5 95.410 (96.698) Mem 22339MB +[2024-07-25 05:15:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.121 Acc@5 96.709 +[2024-07-25 05:15:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.1% +[2024-07-25 05:15:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][0/625] eta 0:15:04 lr 0.000679 wd 0.0500 time 1.4470 (1.4470) data time 0.6051 (0.6051) model time 0.0000 (0.0000) loss 9.0844 (9.0844) grad_norm 2.5060 (2.5060) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:15:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][10/625] eta 0:07:34 lr 0.000679 wd 0.0500 time 0.7287 (0.7392) data time 0.0009 (0.0557) model time 0.0000 (0.0000) loss 8.5245 (7.7507) grad_norm 2.0471 (2.3067) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:15:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][20/625] eta 0:06:39 lr 0.000679 wd 0.0500 time 0.5747 (0.6603) data time 0.0006 (0.0296) model time 0.0000 (0.0000) loss 6.9828 (7.5121) grad_norm 5.4257 (2.6376) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:16:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][30/625] eta 0:06:16 lr 0.000679 wd 0.0500 time 0.5706 (0.6322) data time 0.0009 (0.0203) model time 0.0000 (0.0000) loss 6.0455 (7.4536) grad_norm 3.0706 (2.6125) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:16:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][40/625] eta 0:06:04 lr 0.000679 wd 0.0500 time 0.7798 (0.6230) data time 0.0008 (0.0155) model time 0.0000 (0.0000) loss 6.3056 (7.4575) grad_norm 1.6779 (2.5222) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:16:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][50/625] eta 0:05:52 lr 0.000679 wd 0.0500 time 0.5710 (0.6122) data time 0.0006 (0.0126) model time 0.0000 (0.0000) loss 6.3854 (7.3879) grad_norm 4.2416 (2.4904) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:16:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][60/625] eta 0:05:42 lr 0.000678 wd 0.0500 time 0.5729 (0.6061) data time 0.0006 (0.0107) model time 0.5723 (0.5744) loss 8.1639 (7.5117) grad_norm 2.4786 (2.4821) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:16:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][70/625] eta 0:05:33 lr 0.000678 wd 0.0500 time 0.5761 (0.6015) data time 0.0008 (0.0093) model time 0.5753 (0.5735) loss 7.6786 (7.4769) grad_norm 2.2667 (2.5331) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:16:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][80/625] eta 0:05:25 lr 0.000678 wd 0.0500 time 0.5735 (0.5981) data time 0.0008 (0.0082) model time 0.5726 (0.5733) loss 6.9252 (7.4558) grad_norm 1.7435 (2.5322) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:16:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][90/625] eta 0:05:18 lr 0.000678 wd 0.0500 time 0.5725 (0.5954) data time 0.0008 (0.0074) model time 0.5717 (0.5732) loss 8.7678 (7.5230) grad_norm 1.8395 (2.4954) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:16:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][100/625] eta 0:05:11 lr 0.000678 wd 0.0500 time 0.5736 (0.5933) data time 0.0008 (0.0068) model time 0.5728 (0.5732) loss 7.9557 (7.5078) grad_norm 1.7092 (2.4502) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:16:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][110/625] eta 0:05:04 lr 0.000678 wd 0.0500 time 0.5778 (0.5916) data time 0.0006 (0.0062) model time 0.5771 (0.5733) loss 7.9050 (7.5565) grad_norm 2.4110 (2.4589) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:16:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][120/625] eta 0:04:57 lr 0.000678 wd 0.0500 time 0.5749 (0.5900) data time 0.0008 (0.0058) model time 0.5741 (0.5731) loss 8.1729 (7.5864) grad_norm 2.1668 (2.4740) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:16:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][130/625] eta 0:04:51 lr 0.000678 wd 0.0500 time 0.5741 (0.5889) data time 0.0006 (0.0054) model time 0.5735 (0.5732) loss 6.5146 (7.5749) grad_norm 2.1249 (2.4902) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:17:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][140/625] eta 0:04:45 lr 0.000678 wd 0.0500 time 0.5856 (0.5879) data time 0.0009 (0.0051) model time 0.5847 (0.5733) loss 6.3147 (7.5592) grad_norm 3.9751 (2.5404) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:17:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][150/625] eta 0:04:38 lr 0.000678 wd 0.0500 time 0.5755 (0.5869) data time 0.0006 (0.0048) model time 0.5749 (0.5733) loss 6.3990 (7.5404) grad_norm 2.3497 (2.5712) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:17:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][160/625] eta 0:04:32 lr 0.000677 wd 0.0500 time 0.6004 (0.5863) data time 0.0006 (0.0045) model time 0.5997 (0.5735) loss 6.8099 (7.5450) grad_norm 2.1700 (2.6375) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:17:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][170/625] eta 0:04:26 lr 0.000677 wd 0.0500 time 0.5792 (0.5857) data time 0.0006 (0.0043) model time 0.5786 (0.5737) loss 6.8322 (7.5329) grad_norm 1.8347 (2.6271) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:17:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][180/625] eta 0:04:20 lr 0.000677 wd 0.0500 time 0.5759 (0.5852) data time 0.0008 (0.0041) model time 0.5751 (0.5738) loss 8.0208 (7.5290) grad_norm 2.6850 (2.6180) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:17:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][190/625] eta 0:04:14 lr 0.000677 wd 0.0500 time 0.5760 (0.5854) data time 0.0007 (0.0040) model time 0.5753 (0.5748) loss 8.4290 (7.5233) grad_norm 2.2586 (2.5971) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:17:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][200/625] eta 0:04:09 lr 0.000677 wd 0.0500 time 0.5751 (0.5872) data time 0.0008 (0.0038) model time 0.5743 (0.5779) loss 8.3280 (7.5229) grad_norm 2.7652 (2.5802) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:17:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][210/625] eta 0:04:04 lr 0.000677 wd 0.0500 time 0.7346 (0.5892) data time 0.0006 (0.0037) model time 0.7340 (0.5810) loss 6.3828 (7.5104) grad_norm 1.8564 (2.5708) loss_scale 2048.0000 (1038.5592) mem 22339MB +[2024-07-25 05:17:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][220/625] eta 0:03:59 lr 0.000677 wd 0.0500 time 0.5714 (0.5924) data time 0.0006 (0.0035) model time 0.5709 (0.5856) loss 8.4470 (7.4926) grad_norm 2.6579 (2.5554) loss_scale 2048.0000 (1084.2353) mem 22339MB +[2024-07-25 05:18:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][230/625] eta 0:03:54 lr 0.000677 wd 0.0500 time 0.5782 (0.5939) data time 0.0006 (0.0034) model time 0.5777 (0.5879) loss 6.7362 (7.4762) grad_norm 1.9862 (2.5403) loss_scale 2048.0000 (1125.9567) mem 22339MB +[2024-07-25 05:18:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][240/625] eta 0:03:48 lr 0.000677 wd 0.0500 time 0.5834 (0.5935) data time 0.0006 (0.0033) model time 0.5828 (0.5876) loss 7.5359 (7.4748) grad_norm 4.1835 (2.5496) loss_scale 2048.0000 (1164.2158) mem 22339MB +[2024-07-25 05:18:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][250/625] eta 0:03:42 lr 0.000676 wd 0.0500 time 0.5752 (0.5927) data time 0.0006 (0.0032) model time 0.5746 (0.5870) loss 5.9441 (7.4624) grad_norm 3.5759 (2.5855) loss_scale 2048.0000 (1199.4263) mem 22339MB +[2024-07-25 05:18:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][260/625] eta 0:03:36 lr 0.000676 wd 0.0500 time 0.5215 (0.5923) data time 0.0006 (0.0031) model time 0.5209 (0.5867) loss 6.1390 (7.4626) grad_norm 2.1154 (2.5776) loss_scale 2048.0000 (1231.9387) mem 22339MB +[2024-07-25 05:18:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][270/625] eta 0:03:30 lr 0.000676 wd 0.0500 time 0.5762 (0.5916) data time 0.0006 (0.0030) model time 0.5755 (0.5861) loss 9.0264 (7.4676) grad_norm 2.0235 (2.5672) loss_scale 2048.0000 (1262.0517) mem 22339MB +[2024-07-25 05:18:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][280/625] eta 0:03:23 lr 0.000676 wd 0.0500 time 0.5775 (0.5910) data time 0.0008 (0.0029) model time 0.5767 (0.5855) loss 8.4682 (7.4698) grad_norm 1.6367 (2.5846) loss_scale 2048.0000 (1290.0214) mem 22339MB +[2024-07-25 05:18:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][290/625] eta 0:03:17 lr 0.000676 wd 0.0500 time 0.5743 (0.5904) data time 0.0008 (0.0029) model time 0.5735 (0.5850) loss 8.2046 (7.4598) grad_norm 2.8319 (2.6000) loss_scale 2048.0000 (1316.0687) mem 22339MB +[2024-07-25 05:18:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][300/625] eta 0:03:11 lr 0.000676 wd 0.0500 time 0.5740 (0.5899) data time 0.0006 (0.0028) model time 0.5734 (0.5846) loss 8.2592 (7.4767) grad_norm 2.1669 (2.5937) loss_scale 2048.0000 (1340.3854) mem 22339MB +[2024-07-25 05:18:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][310/625] eta 0:03:05 lr 0.000676 wd 0.0500 time 0.5750 (0.5894) data time 0.0006 (0.0027) model time 0.5744 (0.5841) loss 6.3980 (7.4732) grad_norm 1.9880 (2.5776) loss_scale 2048.0000 (1363.1383) mem 22339MB +[2024-07-25 05:18:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][320/625] eta 0:02:59 lr 0.000676 wd 0.0500 time 0.5769 (0.5890) data time 0.0007 (0.0027) model time 0.5762 (0.5838) loss 8.5820 (7.4669) grad_norm 2.0938 (2.5701) loss_scale 2048.0000 (1384.4735) mem 22339MB +[2024-07-25 05:18:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][330/625] eta 0:02:53 lr 0.000676 wd 0.0500 time 0.5904 (0.5886) data time 0.0006 (0.0026) model time 0.5898 (0.5835) loss 6.0894 (7.4577) grad_norm 3.2024 (2.5555) loss_scale 2048.0000 (1404.5196) mem 22339MB +[2024-07-25 05:19:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][340/625] eta 0:02:47 lr 0.000676 wd 0.0500 time 0.5760 (0.5882) data time 0.0008 (0.0026) model time 0.5752 (0.5832) loss 8.4069 (7.4529) grad_norm 1.5574 (2.5496) loss_scale 2048.0000 (1423.3900) mem 22339MB +[2024-07-25 05:19:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][350/625] eta 0:02:41 lr 0.000675 wd 0.0500 time 0.5771 (0.5879) data time 0.0008 (0.0025) model time 0.5763 (0.5829) loss 7.3612 (7.4514) grad_norm 2.1498 (2.5441) loss_scale 2048.0000 (1441.1852) mem 22339MB +[2024-07-25 05:19:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][360/625] eta 0:02:35 lr 0.000675 wd 0.0500 time 0.5784 (0.5875) data time 0.0008 (0.0025) model time 0.5776 (0.5827) loss 8.4866 (7.4520) grad_norm 1.9267 (2.5274) loss_scale 2048.0000 (1457.9945) mem 22339MB +[2024-07-25 05:19:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][370/625] eta 0:02:29 lr 0.000675 wd 0.0500 time 0.5947 (0.5874) data time 0.0008 (0.0024) model time 0.5939 (0.5826) loss 6.7586 (7.4465) grad_norm 2.3233 (2.5320) loss_scale 2048.0000 (1473.8976) mem 22339MB +[2024-07-25 05:19:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][380/625] eta 0:02:23 lr 0.000675 wd 0.0500 time 0.5773 (0.5871) data time 0.0008 (0.0024) model time 0.5765 (0.5824) loss 6.6844 (7.4343) grad_norm 1.8577 (2.5205) loss_scale 2048.0000 (1488.9659) mem 22339MB +[2024-07-25 05:19:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][390/625] eta 0:02:17 lr 0.000675 wd 0.0500 time 0.5775 (0.5868) data time 0.0008 (0.0023) model time 0.5768 (0.5822) loss 7.3264 (7.4314) grad_norm 3.1135 (2.5140) loss_scale 2048.0000 (1503.2634) mem 22339MB +[2024-07-25 05:19:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][400/625] eta 0:02:11 lr 0.000675 wd 0.0500 time 0.5733 (0.5865) data time 0.0006 (0.0023) model time 0.5727 (0.5820) loss 5.9206 (7.4336) grad_norm 1.9469 (2.5175) loss_scale 2048.0000 (1516.8479) mem 22339MB +[2024-07-25 05:19:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][410/625] eta 0:02:06 lr 0.000675 wd 0.0500 time 0.5755 (0.5865) data time 0.0009 (0.0023) model time 0.5746 (0.5821) loss 9.8698 (7.4542) grad_norm 2.2706 (2.5224) loss_scale 2048.0000 (1529.7713) mem 22339MB +[2024-07-25 05:19:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][420/625] eta 0:02:00 lr 0.000675 wd 0.0500 time 0.5762 (0.5869) data time 0.0008 (0.0022) model time 0.5755 (0.5826) loss 6.2713 (7.4495) grad_norm 1.8979 (2.5156) loss_scale 2048.0000 (1542.0808) mem 22339MB +[2024-07-25 05:19:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][430/625] eta 0:01:54 lr 0.000675 wd 0.0500 time 0.7024 (0.5878) data time 0.0008 (0.0022) model time 0.7016 (0.5837) loss 7.9566 (7.4474) grad_norm 1.8606 (2.5053) loss_scale 2048.0000 (1553.8190) mem 22339MB +[2024-07-25 05:20:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][440/625] eta 0:01:48 lr 0.000674 wd 0.0500 time 0.6069 (0.5886) data time 0.0006 (0.0022) model time 0.6063 (0.5847) loss 7.1431 (7.4523) grad_norm 2.8167 (2.5170) loss_scale 2048.0000 (1565.0249) mem 22339MB +[2024-07-25 05:20:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][450/625] eta 0:01:43 lr 0.000674 wd 0.0500 time 0.5871 (0.5895) data time 0.0007 (0.0022) model time 0.5863 (0.5858) loss 7.3044 (7.4591) grad_norm 2.9546 (2.5284) loss_scale 2048.0000 (1575.7339) mem 22339MB +[2024-07-25 05:20:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][460/625] eta 0:01:37 lr 0.000674 wd 0.0500 time 0.5748 (0.5895) data time 0.0006 (0.0021) model time 0.5742 (0.5859) loss 6.2467 (7.4666) grad_norm 3.4020 (2.5370) loss_scale 2048.0000 (1585.9783) mem 22339MB +[2024-07-25 05:20:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][470/625] eta 0:01:31 lr 0.000674 wd 0.0500 time 0.5800 (0.5893) data time 0.0008 (0.0021) model time 0.5793 (0.5856) loss 8.6985 (7.4661) grad_norm 1.6755 (2.5329) loss_scale 2048.0000 (1595.7877) mem 22339MB +[2024-07-25 05:20:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][480/625] eta 0:01:25 lr 0.000674 wd 0.0500 time 0.7088 (0.5893) data time 0.0008 (0.0021) model time 0.7080 (0.5858) loss 9.0048 (7.4683) grad_norm 2.1180 (2.5230) loss_scale 2048.0000 (1605.1892) mem 22339MB +[2024-07-25 05:20:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][490/625] eta 0:01:19 lr 0.000674 wd 0.0500 time 0.5717 (0.5890) data time 0.0008 (0.0021) model time 0.5709 (0.5855) loss 6.8424 (7.4724) grad_norm 2.6189 (2.5142) loss_scale 2048.0000 (1614.2077) mem 22339MB +[2024-07-25 05:20:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][500/625] eta 0:01:13 lr 0.000674 wd 0.0500 time 0.5670 (0.5887) data time 0.0008 (0.0020) model time 0.5662 (0.5853) loss 8.0865 (7.4664) grad_norm 3.0133 (2.5163) loss_scale 2048.0000 (1622.8663) mem 22339MB +[2024-07-25 05:20:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][510/625] eta 0:01:07 lr 0.000674 wd 0.0500 time 0.5757 (0.5885) data time 0.0006 (0.0020) model time 0.5751 (0.5850) loss 7.5126 (7.4639) grad_norm 2.2854 (2.5378) loss_scale 2048.0000 (1631.1859) mem 22339MB +[2024-07-25 05:20:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][520/625] eta 0:01:01 lr 0.000674 wd 0.0500 time 0.5743 (0.5882) data time 0.0006 (0.0020) model time 0.5736 (0.5848) loss 7.3702 (7.4676) grad_norm 2.2553 (2.5364) loss_scale 2048.0000 (1639.1862) mem 22339MB +[2024-07-25 05:20:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][530/625] eta 0:00:55 lr 0.000674 wd 0.0500 time 0.5754 (0.5880) data time 0.0006 (0.0020) model time 0.5748 (0.5846) loss 6.9660 (7.4644) grad_norm 4.2817 (2.5288) loss_scale 2048.0000 (1646.8851) mem 22339MB +[2024-07-25 05:21:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][540/625] eta 0:00:49 lr 0.000673 wd 0.0500 time 0.5726 (0.5877) data time 0.0009 (0.0019) model time 0.5717 (0.5844) loss 7.0386 (7.4645) grad_norm 2.5937 (2.5259) loss_scale 2048.0000 (1654.2994) mem 22339MB +[2024-07-25 05:21:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][550/625] eta 0:00:44 lr 0.000673 wd 0.0500 time 0.5739 (0.5875) data time 0.0008 (0.0019) model time 0.5731 (0.5842) loss 7.5218 (7.4681) grad_norm 3.3174 (2.5319) loss_scale 2048.0000 (1661.4446) mem 22339MB +[2024-07-25 05:21:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][560/625] eta 0:00:38 lr 0.000673 wd 0.0500 time 0.5759 (0.5873) data time 0.0006 (0.0019) model time 0.5753 (0.5840) loss 6.9046 (7.4621) grad_norm 1.6611 (2.5327) loss_scale 2048.0000 (1668.3351) mem 22339MB +[2024-07-25 05:21:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][570/625] eta 0:00:32 lr 0.000673 wd 0.0500 time 0.5767 (0.5871) data time 0.0008 (0.0019) model time 0.5759 (0.5838) loss 5.6448 (7.4586) grad_norm 1.8718 (2.5284) loss_scale 2048.0000 (1674.9842) mem 22339MB +[2024-07-25 05:21:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][580/625] eta 0:00:26 lr 0.000673 wd 0.0500 time 0.5773 (0.5869) data time 0.0006 (0.0019) model time 0.5767 (0.5836) loss 5.9695 (7.4571) grad_norm 1.6366 (2.5256) loss_scale 2048.0000 (1681.4045) mem 22339MB +[2024-07-25 05:21:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][590/625] eta 0:00:20 lr 0.000673 wd 0.0500 time 0.5754 (0.5867) data time 0.0008 (0.0018) model time 0.5746 (0.5835) loss 7.8647 (7.4632) grad_norm 2.9582 (2.5312) loss_scale 2048.0000 (1687.6074) mem 22339MB +[2024-07-25 05:21:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][600/625] eta 0:00:14 lr 0.000673 wd 0.0500 time 0.5761 (0.5865) data time 0.0006 (0.0018) model time 0.5755 (0.5833) loss 7.6054 (7.4617) grad_norm 2.6101 (2.5444) loss_scale 2048.0000 (1693.6040) mem 22339MB +[2024-07-25 05:21:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][610/625] eta 0:00:08 lr 0.000673 wd 0.0500 time 0.5775 (0.5863) data time 0.0004 (0.0018) model time 0.5772 (0.5831) loss 6.7514 (7.4602) grad_norm 2.3209 (2.5445) loss_scale 2048.0000 (1699.4043) mem 22339MB +[2024-07-25 05:21:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [149/300][620/625] eta 0:00:02 lr 0.000673 wd 0.0500 time 0.5765 (0.5861) data time 0.0005 (0.0018) model time 0.5759 (0.5830) loss 8.6430 (7.4533) grad_norm 1.5559 (2.5394) loss_scale 2048.0000 (1705.0177) mem 22339MB +[2024-07-25 05:21:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 149 training takes 0:06:06 +[2024-07-25 05:21:49 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 05:21:50 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 05:21:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.487 (0.487) Loss 0.5249 (0.5249) Acc@1 88.965 (88.965) Acc@5 98.779 (98.779) Mem 22339MB +[2024-07-25 05:21:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.8198 (0.6426) Acc@1 80.615 (86.088) Acc@5 96.191 (97.683) Mem 22339MB +[2024-07-25 05:21:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.9429 (0.7541) Acc@1 77.246 (82.971) Acc@5 94.775 (96.540) Mem 22339MB +[2024-07-25 05:21:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.626 Acc@5 96.505 +[2024-07-25 05:21:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.6% +[2024-07-25 05:21:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 82.63% +[2024-07-25 05:21:54 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 05:21:55 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 05:21:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.471 (0.471) Loss 0.4934 (0.4934) Acc@1 89.648 (89.648) Acc@5 98.828 (98.828) Mem 22339MB +[2024-07-25 05:21:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7744 (0.6241) Acc@1 81.396 (86.634) Acc@5 96.338 (97.741) Mem 22339MB +[2024-07-25 05:21:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9023 (0.7278) Acc@1 77.979 (83.487) Acc@5 95.410 (96.710) Mem 22339MB +[2024-07-25 05:21:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.137 Acc@5 96.719 +[2024-07-25 05:21:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.1% +[2024-07-25 05:21:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.14% +[2024-07-25 05:21:59 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 05:22:00 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 05:22:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][0/625] eta 0:09:11 lr 0.000673 wd 0.0500 time 0.8819 (0.8819) data time 0.3620 (0.3620) model time 0.0000 (0.0000) loss 7.7873 (7.7873) grad_norm 2.3666 (2.3666) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 05:22:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][10/625] eta 0:06:19 lr 0.000672 wd 0.0500 time 0.5750 (0.6176) data time 0.0006 (0.0336) model time 0.0000 (0.0000) loss 8.7021 (7.5283) grad_norm 2.1723 (2.5290) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 05:22:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][20/625] eta 0:06:19 lr 0.000672 wd 0.0500 time 0.7346 (0.6276) data time 0.0008 (0.0179) model time 0.0000 (0.0000) loss 7.6424 (7.2740) grad_norm 2.3207 (2.3520) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 05:22:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][30/625] eta 0:06:17 lr 0.000672 wd 0.0500 time 0.7473 (0.6351) data time 0.0006 (0.0124) model time 0.0000 (0.0000) loss 8.5601 (7.3883) grad_norm 1.5920 (2.2599) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 05:22:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][40/625] eta 0:06:10 lr 0.000672 wd 0.0500 time 0.7692 (0.6325) data time 0.0007 (0.0096) model time 0.0000 (0.0000) loss 8.9965 (7.4411) grad_norm 1.8510 (2.2011) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 05:22:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][50/625] eta 0:05:57 lr 0.000672 wd 0.0500 time 0.5661 (0.6219) data time 0.0008 (0.0078) model time 0.0000 (0.0000) loss 8.4509 (7.4388) grad_norm 2.8583 (2.3310) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 05:22:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][60/625] eta 0:05:47 lr 0.000672 wd 0.0500 time 0.5720 (0.6142) data time 0.0007 (0.0067) model time 0.5713 (0.5742) loss 7.0914 (7.4177) grad_norm 2.4888 (2.3435) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 05:22:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][70/625] eta 0:05:37 lr 0.000672 wd 0.0500 time 0.5696 (0.6087) data time 0.0008 (0.0059) model time 0.5688 (0.5742) loss 6.2142 (7.4589) grad_norm 2.7330 (2.3682) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 05:22:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][80/625] eta 0:05:29 lr 0.000672 wd 0.0500 time 0.5697 (0.6041) data time 0.0006 (0.0052) model time 0.5691 (0.5731) loss 7.6732 (7.4741) grad_norm 2.2345 (inf) loss_scale 1024.0000 (2010.0741) mem 22339MB +[2024-07-25 05:22:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][90/625] eta 0:05:22 lr 0.000672 wd 0.0500 time 0.5613 (0.6020) data time 0.0007 (0.0048) model time 0.5606 (0.5758) loss 8.1039 (7.5057) grad_norm 1.5604 (inf) loss_scale 1024.0000 (1901.7143) mem 22339MB +[2024-07-25 05:23:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][100/625] eta 0:05:14 lr 0.000671 wd 0.0500 time 0.5634 (0.5992) data time 0.0006 (0.0044) model time 0.5628 (0.5753) loss 6.7796 (7.4501) grad_norm 1.8558 (inf) loss_scale 1024.0000 (1814.8119) mem 22339MB +[2024-07-25 05:23:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][110/625] eta 0:05:07 lr 0.000671 wd 0.0500 time 0.5753 (0.5973) data time 0.0008 (0.0041) model time 0.5745 (0.5755) loss 7.9698 (7.4651) grad_norm 2.5606 (inf) loss_scale 1024.0000 (1743.5676) mem 22339MB +[2024-07-25 05:23:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][120/625] eta 0:05:01 lr 0.000671 wd 0.0500 time 0.5708 (0.5965) data time 0.0006 (0.0038) model time 0.5702 (0.5771) loss 6.9029 (7.4359) grad_norm 1.6947 (inf) loss_scale 1024.0000 (1684.0992) mem 22339MB +[2024-07-25 05:23:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][130/625] eta 0:04:54 lr 0.000671 wd 0.0500 time 0.5727 (0.5949) data time 0.0006 (0.0037) model time 0.5721 (0.5768) loss 6.2786 (7.4209) grad_norm 2.0946 (inf) loss_scale 1024.0000 (1633.7099) mem 22339MB +[2024-07-25 05:23:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][140/625] eta 0:04:47 lr 0.000671 wd 0.0500 time 0.5749 (0.5936) data time 0.0009 (0.0035) model time 0.5741 (0.5767) loss 7.6334 (7.4445) grad_norm 3.5458 (inf) loss_scale 1024.0000 (1590.4681) mem 22339MB +[2024-07-25 05:23:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][150/625] eta 0:04:41 lr 0.000671 wd 0.0500 time 0.5730 (0.5927) data time 0.0006 (0.0033) model time 0.5724 (0.5769) loss 6.5580 (7.4658) grad_norm 1.6500 (inf) loss_scale 1024.0000 (1552.9536) mem 22339MB +[2024-07-25 05:23:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][160/625] eta 0:04:35 lr 0.000671 wd 0.0500 time 0.5713 (0.5917) data time 0.0006 (0.0031) model time 0.5707 (0.5768) loss 7.8238 (7.4800) grad_norm 2.4566 (inf) loss_scale 1024.0000 (1520.0994) mem 22339MB +[2024-07-25 05:23:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][170/625] eta 0:04:28 lr 0.000671 wd 0.0500 time 0.5727 (0.5907) data time 0.0008 (0.0030) model time 0.5719 (0.5765) loss 8.4469 (7.4806) grad_norm 1.7313 (inf) loss_scale 1024.0000 (1491.0877) mem 22339MB +[2024-07-25 05:23:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][180/625] eta 0:04:22 lr 0.000671 wd 0.0500 time 0.5751 (0.5899) data time 0.0008 (0.0029) model time 0.5743 (0.5763) loss 6.9011 (7.4637) grad_norm 2.2533 (inf) loss_scale 1024.0000 (1465.2818) mem 22339MB +[2024-07-25 05:23:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][190/625] eta 0:04:16 lr 0.000670 wd 0.0500 time 0.5728 (0.5891) data time 0.0007 (0.0028) model time 0.5722 (0.5762) loss 6.9175 (7.4725) grad_norm 2.6370 (inf) loss_scale 1024.0000 (1442.1780) mem 22339MB +[2024-07-25 05:23:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][200/625] eta 0:04:10 lr 0.000670 wd 0.0500 time 0.5642 (0.5885) data time 0.0008 (0.0027) model time 0.5634 (0.5763) loss 8.2380 (7.4826) grad_norm 1.5142 (inf) loss_scale 1024.0000 (1421.3731) mem 22339MB +[2024-07-25 05:24:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][210/625] eta 0:04:04 lr 0.000670 wd 0.0500 time 0.5745 (0.5883) data time 0.0006 (0.0026) model time 0.5739 (0.5767) loss 6.7197 (7.4775) grad_norm 2.2252 (inf) loss_scale 1024.0000 (1402.5403) mem 22339MB +[2024-07-25 05:24:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][220/625] eta 0:03:58 lr 0.000670 wd 0.0500 time 0.5734 (0.5877) data time 0.0006 (0.0025) model time 0.5727 (0.5765) loss 6.7649 (7.4968) grad_norm 1.7416 (inf) loss_scale 1024.0000 (1385.4118) mem 22339MB +[2024-07-25 05:24:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][230/625] eta 0:03:52 lr 0.000670 wd 0.0500 time 0.5695 (0.5877) data time 0.0006 (0.0025) model time 0.5689 (0.5771) loss 8.1895 (7.4951) grad_norm 1.9792 (inf) loss_scale 1024.0000 (1369.7662) mem 22339MB +[2024-07-25 05:24:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][240/625] eta 0:03:47 lr 0.000670 wd 0.0500 time 0.5705 (0.5898) data time 0.0006 (0.0024) model time 0.5699 (0.5802) loss 6.4169 (7.4791) grad_norm 2.4457 (inf) loss_scale 1024.0000 (1355.4191) mem 22339MB +[2024-07-25 05:24:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][250/625] eta 0:03:42 lr 0.000670 wd 0.0500 time 0.5683 (0.5920) data time 0.0006 (0.0023) model time 0.5676 (0.5835) loss 6.5591 (7.4559) grad_norm 1.5802 (inf) loss_scale 1024.0000 (1342.2151) mem 22339MB +[2024-07-25 05:24:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][260/625] eta 0:03:37 lr 0.000670 wd 0.0500 time 0.7240 (0.5945) data time 0.0008 (0.0023) model time 0.7232 (0.5870) loss 6.1554 (7.4539) grad_norm 2.2168 (inf) loss_scale 1024.0000 (1330.0230) mem 22339MB +[2024-07-25 05:24:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][270/625] eta 0:03:30 lr 0.000670 wd 0.0500 time 0.5655 (0.5942) data time 0.0008 (0.0022) model time 0.5647 (0.5869) loss 6.9548 (7.4548) grad_norm 2.4178 (inf) loss_scale 1024.0000 (1318.7306) mem 22339MB +[2024-07-25 05:24:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][280/625] eta 0:03:24 lr 0.000670 wd 0.0500 time 0.5741 (0.5935) data time 0.0006 (0.0022) model time 0.5736 (0.5863) loss 6.0800 (7.4564) grad_norm 1.9148 (inf) loss_scale 1024.0000 (1308.2420) mem 22339MB +[2024-07-25 05:24:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][290/625] eta 0:03:18 lr 0.000669 wd 0.0500 time 0.5675 (0.5928) data time 0.0006 (0.0021) model time 0.5668 (0.5858) loss 7.6541 (7.4445) grad_norm 2.0060 (inf) loss_scale 1024.0000 (1298.4742) mem 22339MB +[2024-07-25 05:24:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][300/625] eta 0:03:12 lr 0.000669 wd 0.0500 time 0.5716 (0.5922) data time 0.0006 (0.0021) model time 0.5710 (0.5853) loss 6.9716 (7.4508) grad_norm 2.3872 (inf) loss_scale 1024.0000 (1289.3555) mem 22339MB +[2024-07-25 05:25:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][310/625] eta 0:03:06 lr 0.000669 wd 0.0500 time 0.5717 (0.5916) data time 0.0007 (0.0020) model time 0.5710 (0.5848) loss 7.3097 (7.4287) grad_norm 2.1228 (inf) loss_scale 1024.0000 (1280.8232) mem 22339MB +[2024-07-25 05:25:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][320/625] eta 0:03:00 lr 0.000669 wd 0.0500 time 0.5725 (0.5911) data time 0.0006 (0.0020) model time 0.5719 (0.5844) loss 7.1505 (7.4317) grad_norm 5.4659 (inf) loss_scale 1024.0000 (1272.8224) mem 22339MB +[2024-07-25 05:25:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][330/625] eta 0:02:54 lr 0.000669 wd 0.0500 time 0.5741 (0.5905) data time 0.0008 (0.0020) model time 0.5733 (0.5839) loss 7.5930 (7.4301) grad_norm 1.6542 (inf) loss_scale 1024.0000 (1265.3051) mem 22339MB +[2024-07-25 05:25:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][340/625] eta 0:02:48 lr 0.000669 wd 0.0500 time 0.5743 (0.5901) data time 0.0008 (0.0019) model time 0.5735 (0.5837) loss 5.6450 (7.4204) grad_norm 2.2614 (inf) loss_scale 1024.0000 (1258.2287) mem 22339MB +[2024-07-25 05:25:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][350/625] eta 0:02:42 lr 0.000669 wd 0.0500 time 0.5690 (0.5897) data time 0.0008 (0.0019) model time 0.5682 (0.5833) loss 6.2402 (7.4191) grad_norm 2.0815 (inf) loss_scale 1024.0000 (1251.5556) mem 22339MB +[2024-07-25 05:25:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][360/625] eta 0:02:36 lr 0.000669 wd 0.0500 time 0.5718 (0.5892) data time 0.0006 (0.0019) model time 0.5711 (0.5830) loss 6.7274 (7.4349) grad_norm 2.0293 (inf) loss_scale 1024.0000 (1245.2521) mem 22339MB +[2024-07-25 05:25:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][370/625] eta 0:02:30 lr 0.000669 wd 0.0500 time 0.5751 (0.5888) data time 0.0007 (0.0018) model time 0.5744 (0.5827) loss 8.0974 (7.4346) grad_norm 1.8944 (inf) loss_scale 1024.0000 (1239.2884) mem 22339MB +[2024-07-25 05:25:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][380/625] eta 0:02:24 lr 0.000668 wd 0.0500 time 0.5734 (0.5886) data time 0.0008 (0.0018) model time 0.5726 (0.5825) loss 8.0455 (7.4404) grad_norm 1.4673 (inf) loss_scale 1024.0000 (1233.6378) mem 22339MB +[2024-07-25 05:25:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][390/625] eta 0:02:18 lr 0.000668 wd 0.0500 time 0.5616 (0.5883) data time 0.0006 (0.0018) model time 0.5610 (0.5824) loss 7.4914 (7.4492) grad_norm 1.5308 (inf) loss_scale 1024.0000 (1228.2762) mem 22339MB +[2024-07-25 05:25:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][400/625] eta 0:02:12 lr 0.000668 wd 0.0500 time 0.5671 (0.5880) data time 0.0008 (0.0018) model time 0.5663 (0.5821) loss 5.6554 (7.4377) grad_norm 2.1104 (inf) loss_scale 1024.0000 (1223.1820) mem 22339MB +[2024-07-25 05:26:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][410/625] eta 0:02:06 lr 0.000668 wd 0.0500 time 0.5641 (0.5876) data time 0.0007 (0.0017) model time 0.5635 (0.5819) loss 6.2110 (7.4393) grad_norm 1.9946 (inf) loss_scale 1024.0000 (1218.3358) mem 22339MB +[2024-07-25 05:26:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][420/625] eta 0:02:00 lr 0.000668 wd 0.0500 time 0.5716 (0.5873) data time 0.0008 (0.0017) model time 0.5708 (0.5817) loss 8.6966 (7.4400) grad_norm 2.2808 (inf) loss_scale 1024.0000 (1213.7197) mem 22339MB +[2024-07-25 05:26:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][430/625] eta 0:01:54 lr 0.000668 wd 0.0500 time 0.5663 (0.5874) data time 0.0006 (0.0017) model time 0.5657 (0.5819) loss 7.2772 (7.4411) grad_norm 1.4743 (inf) loss_scale 1024.0000 (1209.3179) mem 22339MB +[2024-07-25 05:26:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][440/625] eta 0:01:48 lr 0.000668 wd 0.0500 time 0.5753 (0.5871) data time 0.0006 (0.0017) model time 0.5747 (0.5817) loss 6.1650 (7.4282) grad_norm 1.8945 (inf) loss_scale 1024.0000 (1205.1156) mem 22339MB +[2024-07-25 05:26:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][450/625] eta 0:01:42 lr 0.000668 wd 0.0500 time 0.5725 (0.5872) data time 0.0009 (0.0016) model time 0.5717 (0.5819) loss 9.5021 (7.4205) grad_norm 4.7884 (inf) loss_scale 1024.0000 (1201.0998) mem 22339MB +[2024-07-25 05:26:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][460/625] eta 0:01:36 lr 0.000668 wd 0.0500 time 0.5669 (0.5878) data time 0.0006 (0.0016) model time 0.5663 (0.5827) loss 7.6350 (7.4229) grad_norm 2.7519 (inf) loss_scale 1024.0000 (1197.2581) mem 22339MB +[2024-07-25 05:26:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][470/625] eta 0:01:31 lr 0.000668 wd 0.0500 time 0.5752 (0.5889) data time 0.0006 (0.0016) model time 0.5746 (0.5840) loss 7.3152 (7.4228) grad_norm 2.7646 (inf) loss_scale 1024.0000 (1193.5796) mem 22339MB +[2024-07-25 05:26:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][480/625] eta 0:01:25 lr 0.000667 wd 0.0500 time 0.5713 (0.5899) data time 0.0009 (0.0016) model time 0.5705 (0.5852) loss 6.5316 (7.4223) grad_norm 1.7852 (inf) loss_scale 1024.0000 (1190.0541) mem 22339MB +[2024-07-25 05:26:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][490/625] eta 0:01:19 lr 0.000667 wd 0.0500 time 0.7218 (0.5905) data time 0.0008 (0.0016) model time 0.7211 (0.5860) loss 7.8553 (7.4212) grad_norm 2.4029 (inf) loss_scale 1024.0000 (1186.6721) mem 22339MB +[2024-07-25 05:26:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][500/625] eta 0:01:13 lr 0.000667 wd 0.0500 time 0.5720 (0.5902) data time 0.0006 (0.0016) model time 0.5714 (0.5857) loss 6.0177 (7.4243) grad_norm 2.2108 (inf) loss_scale 1024.0000 (1183.4251) mem 22339MB +[2024-07-25 05:27:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][510/625] eta 0:01:07 lr 0.000667 wd 0.0500 time 0.5751 (0.5899) data time 0.0008 (0.0016) model time 0.5743 (0.5855) loss 7.6498 (7.4259) grad_norm 2.5431 (inf) loss_scale 1024.0000 (1180.3053) mem 22339MB +[2024-07-25 05:27:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][520/625] eta 0:01:01 lr 0.000667 wd 0.0500 time 0.5730 (0.5895) data time 0.0007 (0.0016) model time 0.5724 (0.5851) loss 7.9135 (7.4236) grad_norm 2.9176 (inf) loss_scale 1024.0000 (1177.3052) mem 22339MB +[2024-07-25 05:27:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][530/625] eta 0:00:55 lr 0.000667 wd 0.0500 time 0.5636 (0.5893) data time 0.0006 (0.0016) model time 0.5630 (0.5849) loss 5.6792 (7.4226) grad_norm 2.3431 (inf) loss_scale 1024.0000 (1174.4181) mem 22339MB +[2024-07-25 05:27:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][540/625] eta 0:00:50 lr 0.000667 wd 0.0500 time 0.5730 (0.5890) data time 0.0006 (0.0015) model time 0.5723 (0.5847) loss 6.5401 (7.4250) grad_norm 2.9789 (inf) loss_scale 1024.0000 (1171.6377) mem 22339MB +[2024-07-25 05:27:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][550/625] eta 0:00:44 lr 0.000667 wd 0.0500 time 0.5736 (0.5888) data time 0.0008 (0.0015) model time 0.5727 (0.5845) loss 7.2418 (7.4210) grad_norm 3.2823 (inf) loss_scale 1024.0000 (1168.9583) mem 22339MB +[2024-07-25 05:27:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][560/625] eta 0:00:38 lr 0.000667 wd 0.0500 time 0.5687 (0.5886) data time 0.0008 (0.0015) model time 0.5679 (0.5843) loss 7.3707 (7.4149) grad_norm 3.0025 (inf) loss_scale 1024.0000 (1166.3743) mem 22339MB +[2024-07-25 05:27:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][570/625] eta 0:00:32 lr 0.000666 wd 0.0500 time 0.5729 (0.5885) data time 0.0008 (0.0015) model time 0.5721 (0.5843) loss 7.9925 (7.4126) grad_norm 1.7584 (inf) loss_scale 1024.0000 (1163.8809) mem 22339MB +[2024-07-25 05:27:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][580/625] eta 0:00:26 lr 0.000666 wd 0.0500 time 0.5694 (0.5882) data time 0.0006 (0.0015) model time 0.5688 (0.5841) loss 6.2907 (7.4198) grad_norm 1.8022 (inf) loss_scale 1024.0000 (1161.4733) mem 22339MB +[2024-07-25 05:27:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][590/625] eta 0:00:20 lr 0.000666 wd 0.0500 time 0.5703 (0.5880) data time 0.0006 (0.0015) model time 0.5697 (0.5839) loss 8.1311 (7.4236) grad_norm 1.9530 (inf) loss_scale 1024.0000 (1159.1472) mem 22339MB +[2024-07-25 05:27:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][600/625] eta 0:00:14 lr 0.000666 wd 0.0500 time 0.5754 (0.5878) data time 0.0008 (0.0015) model time 0.5747 (0.5838) loss 8.0828 (7.4285) grad_norm 2.1013 (inf) loss_scale 1024.0000 (1156.8985) mem 22339MB +[2024-07-25 05:27:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][610/625] eta 0:00:08 lr 0.000666 wd 0.0500 time 0.5628 (0.5876) data time 0.0005 (0.0015) model time 0.5622 (0.5836) loss 7.6551 (7.4300) grad_norm 2.3528 (inf) loss_scale 1024.0000 (1154.7234) mem 22339MB +[2024-07-25 05:28:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [150/300][620/625] eta 0:00:02 lr 0.000666 wd 0.0500 time 0.5701 (0.5874) data time 0.0004 (0.0015) model time 0.5698 (0.5834) loss 8.8987 (7.4390) grad_norm 1.9084 (inf) loss_scale 1024.0000 (1152.6184) mem 22339MB +[2024-07-25 05:28:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 150 training takes 0:06:07 +[2024-07-25 05:28:07 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 05:28:09 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 05:28:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.474 (0.474) Loss 0.5303 (0.5303) Acc@1 89.111 (89.111) Acc@5 98.633 (98.633) Mem 22339MB +[2024-07-25 05:28:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.8115 (0.6576) Acc@1 80.957 (86.071) Acc@5 95.947 (97.559) Mem 22339MB +[2024-07-25 05:28:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9292 (0.7677) Acc@1 77.637 (82.801) Acc@5 95.117 (96.517) Mem 22339MB +[2024-07-25 05:28:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.466 Acc@5 96.497 +[2024-07-25 05:28:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.5% +[2024-07-25 05:28:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.805 (0.805) Loss 0.4934 (0.4934) Acc@1 89.697 (89.697) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 05:28:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.187) Loss 0.7739 (0.6241) Acc@1 81.396 (86.661) Acc@5 96.436 (97.749) Mem 22339MB +[2024-07-25 05:28:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.158) Loss 0.9023 (0.7276) Acc@1 78.076 (83.498) Acc@5 95.459 (96.731) Mem 22339MB +[2024-07-25 05:28:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.151 Acc@5 96.745 +[2024-07-25 05:28:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.2% +[2024-07-25 05:28:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.15% +[2024-07-25 05:28:16 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 05:28:18 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 05:28:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][0/625] eta 0:08:44 lr 0.000666 wd 0.0500 time 0.8396 (0.8396) data time 0.3211 (0.3211) model time 0.0000 (0.0000) loss 7.3700 (7.3700) grad_norm 2.4951 (2.4951) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:28:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][10/625] eta 0:06:08 lr 0.000666 wd 0.0500 time 0.5725 (0.5993) data time 0.0008 (0.0299) model time 0.0000 (0.0000) loss 8.4426 (7.6121) grad_norm 1.8012 (2.6053) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:28:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][20/625] eta 0:05:55 lr 0.000666 wd 0.0500 time 0.5742 (0.5873) data time 0.0008 (0.0160) model time 0.0000 (0.0000) loss 6.5678 (7.5244) grad_norm 2.1361 (2.3789) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:28:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][30/625] eta 0:05:47 lr 0.000666 wd 0.0500 time 0.5693 (0.5835) data time 0.0008 (0.0111) model time 0.0000 (0.0000) loss 7.8064 (7.5870) grad_norm 2.0653 (2.3426) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:28:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][40/625] eta 0:05:40 lr 0.000665 wd 0.0500 time 0.5735 (0.5823) data time 0.0006 (0.0086) model time 0.0000 (0.0000) loss 8.1730 (7.6119) grad_norm 2.0351 (2.4477) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:28:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][50/625] eta 0:05:37 lr 0.000665 wd 0.0500 time 0.7314 (0.5871) data time 0.0007 (0.0070) model time 0.0000 (0.0000) loss 7.6440 (7.5465) grad_norm 2.2305 (2.4156) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:28:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][60/625] eta 0:05:35 lr 0.000665 wd 0.0500 time 0.7114 (0.5931) data time 0.0008 (0.0060) model time 0.7107 (0.6224) loss 7.3963 (7.4776) grad_norm 2.0445 (2.3571) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:29:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][70/625] eta 0:05:32 lr 0.000665 wd 0.0500 time 0.7362 (0.5994) data time 0.0009 (0.0053) model time 0.7353 (0.6299) loss 8.5920 (7.5622) grad_norm 1.9650 (2.3307) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:29:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][80/625] eta 0:05:29 lr 0.000665 wd 0.0500 time 0.7368 (0.6046) data time 0.0007 (0.0047) model time 0.7361 (0.6336) loss 7.9722 (7.5415) grad_norm 3.8357 (2.3779) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:29:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][90/625] eta 0:05:22 lr 0.000665 wd 0.0500 time 0.5717 (0.6034) data time 0.0009 (0.0043) model time 0.5708 (0.6233) loss 7.9320 (7.5785) grad_norm 1.8740 (2.3494) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:29:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][100/625] eta 0:05:15 lr 0.000665 wd 0.0500 time 0.5717 (0.6005) data time 0.0008 (0.0040) model time 0.5709 (0.6133) loss 7.5018 (7.5399) grad_norm 2.4596 (2.3234) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:29:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][110/625] eta 0:05:08 lr 0.000665 wd 0.0500 time 0.5742 (0.5982) data time 0.0006 (0.0037) model time 0.5736 (0.6068) loss 8.1961 (7.5660) grad_norm 2.1864 (2.3261) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:29:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][120/625] eta 0:05:01 lr 0.000665 wd 0.0500 time 0.5739 (0.5961) data time 0.0008 (0.0034) model time 0.5731 (0.6019) loss 6.3655 (7.5293) grad_norm 2.3671 (2.3042) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:29:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][130/625] eta 0:04:54 lr 0.000665 wd 0.0500 time 0.5736 (0.5944) data time 0.0006 (0.0032) model time 0.5730 (0.5983) loss 7.0275 (7.5357) grad_norm 2.4136 (2.3219) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:29:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][140/625] eta 0:04:47 lr 0.000664 wd 0.0500 time 0.5715 (0.5931) data time 0.0008 (0.0031) model time 0.5707 (0.5957) loss 8.0106 (7.5380) grad_norm 2.1435 (2.3121) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:29:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][150/625] eta 0:04:41 lr 0.000664 wd 0.0500 time 0.5738 (0.5921) data time 0.0008 (0.0029) model time 0.5730 (0.5938) loss 8.2493 (7.5572) grad_norm 1.9297 (2.3117) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:29:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][160/625] eta 0:04:34 lr 0.000664 wd 0.0500 time 0.5728 (0.5911) data time 0.0006 (0.0028) model time 0.5722 (0.5921) loss 7.2275 (7.5403) grad_norm 1.8273 (2.2823) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:29:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][170/625] eta 0:04:28 lr 0.000664 wd 0.0500 time 0.5703 (0.5901) data time 0.0006 (0.0027) model time 0.5697 (0.5905) loss 8.4255 (7.5424) grad_norm 2.2278 (2.2572) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:30:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][180/625] eta 0:04:22 lr 0.000664 wd 0.0500 time 0.5699 (0.5894) data time 0.0006 (0.0026) model time 0.5693 (0.5895) loss 6.7296 (7.5165) grad_norm 2.2922 (2.2426) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:30:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][190/625] eta 0:04:16 lr 0.000664 wd 0.0500 time 0.5755 (0.5894) data time 0.0008 (0.0025) model time 0.5747 (0.5894) loss 7.7292 (7.5102) grad_norm 2.3817 (2.2732) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:30:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][200/625] eta 0:04:10 lr 0.000664 wd 0.0500 time 0.5731 (0.5889) data time 0.0008 (0.0024) model time 0.5723 (0.5887) loss 6.9444 (7.4811) grad_norm 2.1675 (2.2884) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:30:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][210/625] eta 0:04:04 lr 0.000664 wd 0.0500 time 0.5745 (0.5883) data time 0.0006 (0.0023) model time 0.5739 (0.5878) loss 6.8637 (7.4662) grad_norm 4.9691 (2.3125) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:30:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][220/625] eta 0:03:58 lr 0.000664 wd 0.0500 time 0.5696 (0.5878) data time 0.0006 (0.0023) model time 0.5689 (0.5871) loss 6.7759 (7.4645) grad_norm 1.6533 (inf) loss_scale 512.0000 (1005.4661) mem 22339MB +[2024-07-25 05:30:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][230/625] eta 0:03:51 lr 0.000663 wd 0.0500 time 0.5718 (0.5872) data time 0.0007 (0.0022) model time 0.5710 (0.5864) loss 7.7335 (7.4615) grad_norm 2.4557 (inf) loss_scale 512.0000 (984.1039) mem 22339MB +[2024-07-25 05:30:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][240/625] eta 0:03:45 lr 0.000663 wd 0.0500 time 0.5754 (0.5867) data time 0.0006 (0.0021) model time 0.5748 (0.5857) loss 7.0524 (7.4499) grad_norm 3.7010 (inf) loss_scale 512.0000 (964.5145) mem 22339MB +[2024-07-25 05:30:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][250/625] eta 0:03:39 lr 0.000663 wd 0.0500 time 0.5732 (0.5862) data time 0.0006 (0.0021) model time 0.5726 (0.5851) loss 6.9280 (7.4598) grad_norm 2.1301 (inf) loss_scale 512.0000 (946.4861) mem 22339MB +[2024-07-25 05:30:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][260/625] eta 0:03:33 lr 0.000663 wd 0.0500 time 0.5712 (0.5861) data time 0.0008 (0.0020) model time 0.5703 (0.5851) loss 8.9318 (7.4465) grad_norm 2.7497 (inf) loss_scale 512.0000 (929.8391) mem 22339MB +[2024-07-25 05:30:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][270/625] eta 0:03:28 lr 0.000663 wd 0.0500 time 0.6706 (0.5865) data time 0.0009 (0.0020) model time 0.6698 (0.5855) loss 7.3206 (7.4472) grad_norm 2.3113 (inf) loss_scale 512.0000 (914.4207) mem 22339MB +[2024-07-25 05:31:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][280/625] eta 0:03:22 lr 0.000663 wd 0.0500 time 0.7373 (0.5877) data time 0.0008 (0.0020) model time 0.7364 (0.5870) loss 8.1290 (7.4369) grad_norm 3.6534 (inf) loss_scale 512.0000 (900.0996) mem 22339MB +[2024-07-25 05:31:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][290/625] eta 0:03:17 lr 0.000663 wd 0.0500 time 0.5920 (0.5895) data time 0.0008 (0.0019) model time 0.5913 (0.5891) loss 6.9173 (7.4203) grad_norm 2.0763 (inf) loss_scale 512.0000 (886.7629) mem 22339MB +[2024-07-25 05:31:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][300/625] eta 0:03:11 lr 0.000663 wd 0.0500 time 0.5634 (0.5907) data time 0.0006 (0.0019) model time 0.5628 (0.5906) loss 7.8458 (7.4273) grad_norm 1.9925 (inf) loss_scale 512.0000 (874.3123) mem 22339MB +[2024-07-25 05:31:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][310/625] eta 0:03:06 lr 0.000663 wd 0.0500 time 0.5734 (0.5914) data time 0.0008 (0.0018) model time 0.5726 (0.5914) loss 7.5534 (7.4379) grad_norm 1.9527 (inf) loss_scale 512.0000 (862.6624) mem 22339MB +[2024-07-25 05:31:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][320/625] eta 0:03:00 lr 0.000662 wd 0.0500 time 0.5752 (0.5909) data time 0.0008 (0.0018) model time 0.5745 (0.5908) loss 6.3584 (7.4314) grad_norm 3.1709 (inf) loss_scale 512.0000 (851.7383) mem 22339MB +[2024-07-25 05:31:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][330/625] eta 0:02:54 lr 0.000662 wd 0.0500 time 0.5673 (0.5904) data time 0.0008 (0.0018) model time 0.5665 (0.5902) loss 6.8035 (7.4310) grad_norm 2.2747 (inf) loss_scale 512.0000 (841.4743) mem 22339MB +[2024-07-25 05:31:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][340/625] eta 0:02:48 lr 0.000662 wd 0.0500 time 0.5715 (0.5899) data time 0.0006 (0.0018) model time 0.5708 (0.5896) loss 7.1986 (7.4368) grad_norm 2.6464 (inf) loss_scale 512.0000 (831.8123) mem 22339MB +[2024-07-25 05:31:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][350/625] eta 0:02:42 lr 0.000662 wd 0.0500 time 0.5717 (0.5895) data time 0.0009 (0.0017) model time 0.5709 (0.5890) loss 7.5309 (7.4388) grad_norm 1.5389 (inf) loss_scale 512.0000 (822.7009) mem 22339MB +[2024-07-25 05:31:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][360/625] eta 0:02:36 lr 0.000662 wd 0.0500 time 0.5707 (0.5890) data time 0.0008 (0.0017) model time 0.5698 (0.5885) loss 5.8817 (7.4310) grad_norm 2.5058 (inf) loss_scale 512.0000 (814.0942) mem 22339MB +[2024-07-25 05:31:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][370/625] eta 0:02:30 lr 0.000662 wd 0.0500 time 0.5742 (0.5887) data time 0.0008 (0.0017) model time 0.5733 (0.5881) loss 6.4857 (7.4440) grad_norm 1.9537 (inf) loss_scale 512.0000 (805.9515) mem 22339MB +[2024-07-25 05:32:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][380/625] eta 0:02:24 lr 0.000662 wd 0.0500 time 0.5581 (0.5883) data time 0.0010 (0.0017) model time 0.5571 (0.5877) loss 9.0019 (7.4509) grad_norm 1.8206 (inf) loss_scale 512.0000 (798.2362) mem 22339MB +[2024-07-25 05:32:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][390/625] eta 0:02:18 lr 0.000662 wd 0.0500 time 0.5728 (0.5880) data time 0.0008 (0.0017) model time 0.5721 (0.5873) loss 8.1298 (7.4588) grad_norm 1.9686 (inf) loss_scale 512.0000 (790.9156) mem 22339MB +[2024-07-25 05:32:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][400/625] eta 0:02:12 lr 0.000662 wd 0.0500 time 0.5705 (0.5876) data time 0.0008 (0.0016) model time 0.5697 (0.5869) loss 7.4013 (7.4612) grad_norm 2.4923 (inf) loss_scale 512.0000 (783.9601) mem 22339MB +[2024-07-25 05:32:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][410/625] eta 0:02:06 lr 0.000662 wd 0.0500 time 0.5714 (0.5877) data time 0.0009 (0.0016) model time 0.5705 (0.5869) loss 7.3314 (7.4637) grad_norm 2.3456 (inf) loss_scale 512.0000 (777.3431) mem 22339MB +[2024-07-25 05:32:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][420/625] eta 0:02:00 lr 0.000661 wd 0.0500 time 0.5740 (0.5874) data time 0.0007 (0.0016) model time 0.5733 (0.5866) loss 5.8235 (7.4561) grad_norm 2.2391 (inf) loss_scale 512.0000 (771.0404) mem 22339MB +[2024-07-25 05:32:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][430/625] eta 0:01:54 lr 0.000661 wd 0.0500 time 0.5735 (0.5871) data time 0.0006 (0.0016) model time 0.5729 (0.5862) loss 7.9583 (7.4504) grad_norm 2.7628 (inf) loss_scale 512.0000 (765.0302) mem 22339MB +[2024-07-25 05:32:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][440/625] eta 0:01:48 lr 0.000661 wd 0.0500 time 0.5742 (0.5869) data time 0.0007 (0.0016) model time 0.5735 (0.5860) loss 7.3820 (7.4477) grad_norm 1.8148 (inf) loss_scale 512.0000 (759.2925) mem 22339MB +[2024-07-25 05:32:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][450/625] eta 0:01:42 lr 0.000661 wd 0.0500 time 0.5705 (0.5866) data time 0.0006 (0.0015) model time 0.5700 (0.5857) loss 7.3233 (7.4487) grad_norm 1.8233 (inf) loss_scale 512.0000 (753.8093) mem 22339MB +[2024-07-25 05:32:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][460/625] eta 0:01:36 lr 0.000661 wd 0.0500 time 0.5704 (0.5863) data time 0.0007 (0.0015) model time 0.5697 (0.5854) loss 5.9911 (7.4501) grad_norm 1.8389 (inf) loss_scale 512.0000 (748.5640) mem 22339MB +[2024-07-25 05:32:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][470/625] eta 0:01:30 lr 0.000661 wd 0.0500 time 0.5703 (0.5861) data time 0.0008 (0.0015) model time 0.5695 (0.5851) loss 6.7608 (7.4433) grad_norm 3.1745 (inf) loss_scale 512.0000 (743.5414) mem 22339MB +[2024-07-25 05:33:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][480/625] eta 0:01:24 lr 0.000661 wd 0.0500 time 0.5736 (0.5861) data time 0.0006 (0.0015) model time 0.5730 (0.5852) loss 8.3594 (7.4440) grad_norm 3.6668 (inf) loss_scale 512.0000 (738.7277) mem 22339MB +[2024-07-25 05:33:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][490/625] eta 0:01:19 lr 0.000661 wd 0.0500 time 0.5704 (0.5863) data time 0.0008 (0.0015) model time 0.5696 (0.5854) loss 8.5363 (7.4454) grad_norm 3.4134 (inf) loss_scale 512.0000 (734.1100) mem 22339MB +[2024-07-25 05:33:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][500/625] eta 0:01:13 lr 0.000661 wd 0.0500 time 0.7270 (0.5872) data time 0.0009 (0.0015) model time 0.7261 (0.5864) loss 7.1231 (7.4520) grad_norm 1.9942 (inf) loss_scale 512.0000 (729.6766) mem 22339MB +[2024-07-25 05:33:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][510/625] eta 0:01:07 lr 0.000660 wd 0.0500 time 0.7404 (0.5879) data time 0.0006 (0.0015) model time 0.7397 (0.5872) loss 7.1817 (7.4449) grad_norm 2.5877 (inf) loss_scale 512.0000 (725.4168) mem 22339MB +[2024-07-25 05:33:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][520/625] eta 0:01:01 lr 0.000660 wd 0.0500 time 0.5721 (0.5888) data time 0.0008 (0.0014) model time 0.5713 (0.5882) loss 6.3389 (7.4358) grad_norm 2.8743 (inf) loss_scale 512.0000 (721.3205) mem 22339MB +[2024-07-25 05:33:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][530/625] eta 0:00:55 lr 0.000660 wd 0.0500 time 0.5669 (0.5886) data time 0.0006 (0.0014) model time 0.5662 (0.5879) loss 7.4316 (7.4286) grad_norm 1.6338 (inf) loss_scale 512.0000 (717.3785) mem 22339MB +[2024-07-25 05:33:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][540/625] eta 0:00:50 lr 0.000660 wd 0.0500 time 0.5729 (0.5883) data time 0.0007 (0.0014) model time 0.5723 (0.5876) loss 6.8476 (7.4266) grad_norm 3.0720 (inf) loss_scale 512.0000 (713.5823) mem 22339MB +[2024-07-25 05:33:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][550/625] eta 0:00:44 lr 0.000660 wd 0.0500 time 0.5717 (0.5880) data time 0.0007 (0.0014) model time 0.5711 (0.5873) loss 7.7520 (7.4278) grad_norm 1.8248 (inf) loss_scale 512.0000 (709.9238) mem 22339MB +[2024-07-25 05:33:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][560/625] eta 0:00:38 lr 0.000660 wd 0.0500 time 0.5702 (0.5878) data time 0.0006 (0.0014) model time 0.5696 (0.5870) loss 8.0063 (7.4264) grad_norm 2.3286 (inf) loss_scale 512.0000 (706.3957) mem 22339MB +[2024-07-25 05:33:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][570/625] eta 0:00:32 lr 0.000660 wd 0.0500 time 0.5732 (0.5876) data time 0.0006 (0.0014) model time 0.5726 (0.5868) loss 8.5047 (7.4195) grad_norm 2.7417 (inf) loss_scale 512.0000 (702.9912) mem 22339MB +[2024-07-25 05:33:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][580/625] eta 0:00:26 lr 0.000660 wd 0.0500 time 0.5736 (0.5874) data time 0.0007 (0.0014) model time 0.5729 (0.5866) loss 5.9688 (7.4217) grad_norm 2.0908 (inf) loss_scale 512.0000 (699.7040) mem 22339MB +[2024-07-25 05:34:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][590/625] eta 0:00:20 lr 0.000660 wd 0.0500 time 0.5739 (0.5872) data time 0.0008 (0.0014) model time 0.5731 (0.5863) loss 8.4367 (7.4313) grad_norm 2.6164 (inf) loss_scale 512.0000 (696.5279) mem 22339MB +[2024-07-25 05:34:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][600/625] eta 0:00:14 lr 0.000660 wd 0.0500 time 0.5758 (0.5870) data time 0.0008 (0.0014) model time 0.5750 (0.5861) loss 6.7240 (7.4287) grad_norm 2.1940 (inf) loss_scale 512.0000 (693.4576) mem 22339MB +[2024-07-25 05:34:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][610/625] eta 0:00:08 lr 0.000659 wd 0.0500 time 0.5768 (0.5868) data time 0.0004 (0.0014) model time 0.5764 (0.5859) loss 8.5525 (7.4392) grad_norm 1.4646 (inf) loss_scale 512.0000 (690.4877) mem 22339MB +[2024-07-25 05:34:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [151/300][620/625] eta 0:00:02 lr 0.000659 wd 0.0500 time 0.5704 (0.5866) data time 0.0006 (0.0014) model time 0.5698 (0.5857) loss 6.7494 (7.4347) grad_norm 1.8784 (inf) loss_scale 512.0000 (687.6135) mem 22339MB +[2024-07-25 05:34:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 151 training takes 0:06:06 +[2024-07-25 05:34:25 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 05:34:26 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 05:34:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.498 (0.498) Loss 0.5273 (0.5273) Acc@1 89.453 (89.453) Acc@5 98.633 (98.633) Mem 22339MB +[2024-07-25 05:34:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.159) Loss 0.8438 (0.6530) Acc@1 80.518 (86.102) Acc@5 95.801 (97.692) Mem 22339MB +[2024-07-25 05:34:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.9365 (0.7650) Acc@1 77.930 (82.968) Acc@5 94.873 (96.549) Mem 22339MB +[2024-07-25 05:34:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.632 Acc@5 96.525 +[2024-07-25 05:34:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.6% +[2024-07-25 05:34:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 82.63% +[2024-07-25 05:34:29 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 05:34:31 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 05:34:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.479 (0.479) Loss 0.4937 (0.4937) Acc@1 89.648 (89.648) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-25 05:34:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7725 (0.6238) Acc@1 81.494 (86.679) Acc@5 96.338 (97.741) Mem 22339MB +[2024-07-25 05:34:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9019 (0.7272) Acc@1 78.125 (83.552) Acc@5 95.459 (96.735) Mem 22339MB +[2024-07-25 05:34:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.193 Acc@5 96.751 +[2024-07-25 05:34:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.2% +[2024-07-25 05:34:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.19% +[2024-07-25 05:34:34 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 05:34:36 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 05:34:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][0/625] eta 0:09:15 lr 0.000659 wd 0.0500 time 0.8884 (0.8884) data time 0.3708 (0.3708) model time 0.0000 (0.0000) loss 6.7807 (6.7807) grad_norm 4.8235 (4.8235) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:34:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][10/625] eta 0:06:13 lr 0.000659 wd 0.0500 time 0.5738 (0.6076) data time 0.0006 (0.0344) model time 0.0000 (0.0000) loss 6.7471 (7.5003) grad_norm 2.2134 (2.6981) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:34:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][20/625] eta 0:05:58 lr 0.000659 wd 0.0500 time 0.5739 (0.5929) data time 0.0008 (0.0184) model time 0.0000 (0.0000) loss 6.4094 (7.2418) grad_norm 3.2163 (2.4461) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:34:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][30/625] eta 0:05:49 lr 0.000659 wd 0.0500 time 0.5697 (0.5875) data time 0.0006 (0.0129) model time 0.0000 (0.0000) loss 7.2303 (7.3416) grad_norm 2.2916 (2.3738) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:35:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][40/625] eta 0:05:42 lr 0.000659 wd 0.0500 time 0.5635 (0.5852) data time 0.0008 (0.0105) model time 0.0000 (0.0000) loss 8.7162 (7.3987) grad_norm 2.5990 (2.3465) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:35:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][50/625] eta 0:05:35 lr 0.000659 wd 0.0500 time 0.5658 (0.5829) data time 0.0007 (0.0086) model time 0.0000 (0.0000) loss 7.3994 (7.4152) grad_norm 2.5365 (2.3065) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:35:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][60/625] eta 0:05:28 lr 0.000659 wd 0.0500 time 0.5722 (0.5817) data time 0.0008 (0.0074) model time 0.5714 (0.5743) loss 5.9997 (7.3696) grad_norm 2.0055 (2.3366) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:35:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][70/625] eta 0:05:23 lr 0.000659 wd 0.0500 time 0.7111 (0.5828) data time 0.0009 (0.0064) model time 0.7102 (0.5816) loss 5.9065 (7.3089) grad_norm 1.5795 (2.3304) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:35:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][80/625] eta 0:05:17 lr 0.000658 wd 0.0500 time 0.5707 (0.5823) data time 0.0006 (0.0057) model time 0.5701 (0.5804) loss 6.7546 (7.2666) grad_norm 1.4588 (2.3120) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:35:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][90/625] eta 0:05:14 lr 0.000658 wd 0.0500 time 0.7447 (0.5884) data time 0.0006 (0.0052) model time 0.7441 (0.5946) loss 6.6364 (7.2390) grad_norm 2.2596 (2.2943) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:35:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][100/625] eta 0:05:10 lr 0.000658 wd 0.0500 time 0.5716 (0.5914) data time 0.0007 (0.0047) model time 0.5709 (0.5991) loss 5.7738 (7.2164) grad_norm 2.1455 (2.2885) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:35:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][110/625] eta 0:05:06 lr 0.000658 wd 0.0500 time 0.7115 (0.5961) data time 0.0008 (0.0044) model time 0.7107 (0.6064) loss 6.0234 (7.2182) grad_norm 2.1864 (2.2647) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:35:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][120/625] eta 0:05:01 lr 0.000658 wd 0.0500 time 0.6155 (0.5967) data time 0.0006 (0.0041) model time 0.6149 (0.6060) loss 7.2663 (7.2498) grad_norm 1.9753 (2.2637) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:35:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][130/625] eta 0:04:54 lr 0.000658 wd 0.0500 time 0.5706 (0.5954) data time 0.0006 (0.0038) model time 0.5700 (0.6025) loss 6.1892 (7.2870) grad_norm 2.0669 (2.2620) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:36:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][140/625] eta 0:04:48 lr 0.000658 wd 0.0500 time 0.5712 (0.5938) data time 0.0007 (0.0036) model time 0.5705 (0.5992) loss 7.9690 (7.3232) grad_norm 2.1740 (2.2585) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:36:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][150/625] eta 0:04:41 lr 0.000658 wd 0.0500 time 0.5719 (0.5925) data time 0.0006 (0.0035) model time 0.5713 (0.5965) loss 9.0069 (7.3561) grad_norm 3.0866 (2.2652) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:36:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][160/625] eta 0:04:35 lr 0.000658 wd 0.0500 time 0.5619 (0.5914) data time 0.0008 (0.0033) model time 0.5612 (0.5945) loss 8.8403 (7.3667) grad_norm 2.5554 (2.2587) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:36:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][170/625] eta 0:04:28 lr 0.000657 wd 0.0500 time 0.5723 (0.5905) data time 0.0006 (0.0032) model time 0.5717 (0.5928) loss 6.8140 (7.3789) grad_norm 3.4364 (2.2739) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:36:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][180/625] eta 0:04:22 lr 0.000657 wd 0.0500 time 0.5660 (0.5897) data time 0.0008 (0.0031) model time 0.5652 (0.5914) loss 8.5199 (7.3803) grad_norm 2.9127 (2.2871) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:36:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][190/625] eta 0:04:16 lr 0.000657 wd 0.0500 time 0.5676 (0.5890) data time 0.0006 (0.0030) model time 0.5670 (0.5902) loss 7.7422 (7.3666) grad_norm 2.2826 (2.2754) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:36:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][200/625] eta 0:04:09 lr 0.000657 wd 0.0500 time 0.5611 (0.5882) data time 0.0006 (0.0029) model time 0.5605 (0.5891) loss 6.9799 (7.3595) grad_norm 2.9155 (2.2891) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:36:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][210/625] eta 0:04:03 lr 0.000657 wd 0.0500 time 0.5613 (0.5875) data time 0.0006 (0.0028) model time 0.5607 (0.5881) loss 7.2226 (7.3993) grad_norm 1.8169 (2.3258) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:36:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][220/625] eta 0:03:57 lr 0.000657 wd 0.0500 time 0.5718 (0.5870) data time 0.0007 (0.0027) model time 0.5712 (0.5872) loss 6.1675 (7.3766) grad_norm 4.3356 (2.3577) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:36:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][230/625] eta 0:03:51 lr 0.000657 wd 0.0500 time 0.5697 (0.5864) data time 0.0008 (0.0026) model time 0.5689 (0.5865) loss 7.7888 (7.3812) grad_norm 2.4184 (2.3670) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:36:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][240/625] eta 0:03:45 lr 0.000657 wd 0.0500 time 0.5629 (0.5860) data time 0.0006 (0.0026) model time 0.5623 (0.5859) loss 7.6756 (7.3903) grad_norm 2.7372 (2.3687) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:37:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][250/625] eta 0:03:39 lr 0.000657 wd 0.0500 time 0.5757 (0.5856) data time 0.0006 (0.0025) model time 0.5751 (0.5854) loss 8.2044 (7.3849) grad_norm 3.4745 (2.4118) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:37:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][260/625] eta 0:03:33 lr 0.000656 wd 0.0500 time 0.5729 (0.5852) data time 0.0006 (0.0024) model time 0.5723 (0.5849) loss 7.3412 (7.3649) grad_norm 2.1280 (2.4231) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:37:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][270/625] eta 0:03:27 lr 0.000656 wd 0.0500 time 0.5745 (0.5849) data time 0.0006 (0.0024) model time 0.5739 (0.5845) loss 6.1960 (7.3568) grad_norm 2.6928 (2.4154) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:37:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][280/625] eta 0:03:21 lr 0.000656 wd 0.0500 time 0.5723 (0.5846) data time 0.0008 (0.0023) model time 0.5715 (0.5840) loss 8.5116 (7.3735) grad_norm 2.4212 (2.4121) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:37:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][290/625] eta 0:03:15 lr 0.000656 wd 0.0500 time 0.5739 (0.5843) data time 0.0006 (0.0023) model time 0.5734 (0.5837) loss 6.2341 (7.3831) grad_norm 2.1259 (2.4044) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:37:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][300/625] eta 0:03:10 lr 0.000656 wd 0.0500 time 0.7385 (0.5850) data time 0.0010 (0.0022) model time 0.7375 (0.5845) loss 6.5073 (7.3978) grad_norm 3.0052 (2.4084) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:37:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][310/625] eta 0:03:04 lr 0.000656 wd 0.0500 time 0.5733 (0.5854) data time 0.0009 (0.0022) model time 0.5724 (0.5849) loss 8.6903 (7.3946) grad_norm 2.0751 (2.4004) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:37:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][320/625] eta 0:02:58 lr 0.000656 wd 0.0500 time 0.5626 (0.5865) data time 0.0007 (0.0021) model time 0.5620 (0.5862) loss 6.7681 (7.3982) grad_norm 2.0254 (2.3978) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:37:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][330/625] eta 0:02:53 lr 0.000656 wd 0.0500 time 0.7273 (0.5882) data time 0.0009 (0.0021) model time 0.7264 (0.5883) loss 7.0023 (7.4087) grad_norm 2.7294 (2.3889) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:37:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][340/625] eta 0:02:47 lr 0.000656 wd 0.0500 time 0.5717 (0.5887) data time 0.0008 (0.0020) model time 0.5708 (0.5888) loss 8.1519 (7.4145) grad_norm 2.2419 (2.3897) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:38:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][350/625] eta 0:02:41 lr 0.000656 wd 0.0500 time 0.5720 (0.5886) data time 0.0008 (0.0020) model time 0.5712 (0.5887) loss 6.5780 (7.4015) grad_norm 2.0482 (2.3830) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:38:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][360/625] eta 0:02:35 lr 0.000655 wd 0.0500 time 0.5731 (0.5883) data time 0.0007 (0.0020) model time 0.5725 (0.5883) loss 9.1749 (7.4164) grad_norm 2.5445 (2.3802) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:38:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][370/625] eta 0:02:29 lr 0.000655 wd 0.0500 time 0.5728 (0.5879) data time 0.0008 (0.0020) model time 0.5720 (0.5878) loss 6.9046 (7.4283) grad_norm 1.9365 (2.3695) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:38:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][380/625] eta 0:02:23 lr 0.000655 wd 0.0500 time 0.5721 (0.5875) data time 0.0007 (0.0019) model time 0.5714 (0.5874) loss 6.4331 (7.4319) grad_norm 2.7698 (2.3663) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:38:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][390/625] eta 0:02:18 lr 0.000655 wd 0.0500 time 0.5761 (0.5873) data time 0.0006 (0.0019) model time 0.5755 (0.5870) loss 7.7782 (7.4344) grad_norm 2.4163 (2.3664) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:38:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][400/625] eta 0:02:12 lr 0.000655 wd 0.0500 time 0.5707 (0.5871) data time 0.0008 (0.0019) model time 0.5699 (0.5868) loss 7.2208 (7.4345) grad_norm 2.1161 (2.3649) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:38:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][410/625] eta 0:02:06 lr 0.000655 wd 0.0500 time 0.5729 (0.5868) data time 0.0008 (0.0018) model time 0.5721 (0.5864) loss 8.7627 (7.4388) grad_norm 2.3250 (2.3614) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:38:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][420/625] eta 0:02:00 lr 0.000655 wd 0.0500 time 0.5755 (0.5865) data time 0.0008 (0.0018) model time 0.5748 (0.5861) loss 7.5184 (7.4387) grad_norm 1.9872 (2.3588) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:38:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][430/625] eta 0:01:54 lr 0.000655 wd 0.0500 time 0.5620 (0.5864) data time 0.0006 (0.0018) model time 0.5614 (0.5860) loss 8.2260 (7.4476) grad_norm 1.8287 (2.3638) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:38:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][440/625] eta 0:01:48 lr 0.000655 wd 0.0500 time 0.5620 (0.5862) data time 0.0006 (0.0018) model time 0.5614 (0.5857) loss 6.4326 (7.4401) grad_norm 2.1089 (2.3603) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:39:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][450/625] eta 0:01:42 lr 0.000654 wd 0.0500 time 0.5747 (0.5860) data time 0.0006 (0.0017) model time 0.5741 (0.5855) loss 6.7981 (7.4431) grad_norm 3.0091 (2.3647) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:39:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][460/625] eta 0:01:36 lr 0.000654 wd 0.0500 time 0.5726 (0.5860) data time 0.0006 (0.0017) model time 0.5720 (0.5855) loss 7.9793 (7.4455) grad_norm 2.6621 (2.3613) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:39:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][470/625] eta 0:01:30 lr 0.000654 wd 0.0500 time 0.5753 (0.5858) data time 0.0007 (0.0017) model time 0.5746 (0.5853) loss 7.8350 (7.4500) grad_norm 3.9010 (2.3644) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:39:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][480/625] eta 0:01:24 lr 0.000654 wd 0.0500 time 0.5731 (0.5856) data time 0.0008 (0.0017) model time 0.5723 (0.5851) loss 7.2427 (7.4501) grad_norm 2.4501 (2.3918) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:39:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][490/625] eta 0:01:19 lr 0.000654 wd 0.0500 time 0.5693 (0.5854) data time 0.0007 (0.0017) model time 0.5686 (0.5848) loss 7.8470 (7.4512) grad_norm 1.9960 (2.3979) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:39:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][500/625] eta 0:01:13 lr 0.000654 wd 0.0500 time 0.5744 (0.5852) data time 0.0008 (0.0017) model time 0.5736 (0.5846) loss 7.9102 (7.4477) grad_norm 1.9081 (2.3920) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:39:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][510/625] eta 0:01:07 lr 0.000654 wd 0.0500 time 0.5712 (0.5850) data time 0.0006 (0.0016) model time 0.5706 (0.5843) loss 7.7578 (7.4556) grad_norm 1.7045 (2.3907) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:39:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][520/625] eta 0:01:01 lr 0.000654 wd 0.0500 time 0.5726 (0.5851) data time 0.0008 (0.0016) model time 0.5718 (0.5844) loss 7.9707 (7.4488) grad_norm 2.2092 (2.3827) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:39:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][530/625] eta 0:00:55 lr 0.000654 wd 0.0500 time 0.5748 (0.5858) data time 0.0008 (0.0016) model time 0.5740 (0.5853) loss 7.5007 (7.4516) grad_norm 2.3217 (2.3799) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:39:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][540/625] eta 0:00:49 lr 0.000654 wd 0.0500 time 0.6785 (0.5862) data time 0.0006 (0.0016) model time 0.6779 (0.5857) loss 5.9636 (7.4446) grad_norm 2.5030 (2.3791) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:40:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][550/625] eta 0:00:44 lr 0.000653 wd 0.0500 time 0.7565 (0.5874) data time 0.0008 (0.0016) model time 0.7557 (0.5870) loss 8.0846 (7.4428) grad_norm 3.5302 (2.3842) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:40:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][560/625] eta 0:00:38 lr 0.000653 wd 0.0500 time 0.5724 (0.5877) data time 0.0008 (0.0016) model time 0.5717 (0.5873) loss 8.2824 (7.4495) grad_norm 1.8531 (2.3893) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:40:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][570/625] eta 0:00:32 lr 0.000653 wd 0.0500 time 0.5730 (0.5878) data time 0.0006 (0.0015) model time 0.5724 (0.5875) loss 7.1751 (7.4448) grad_norm 1.9144 (2.3899) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:40:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][580/625] eta 0:00:26 lr 0.000653 wd 0.0500 time 0.5706 (0.5876) data time 0.0008 (0.0015) model time 0.5698 (0.5872) loss 5.5673 (7.4392) grad_norm 2.2146 (2.3950) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:40:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][590/625] eta 0:00:20 lr 0.000653 wd 0.0500 time 0.5654 (0.5874) data time 0.0008 (0.0015) model time 0.5646 (0.5869) loss 5.9449 (7.4359) grad_norm 1.8128 (2.3872) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:40:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][600/625] eta 0:00:14 lr 0.000653 wd 0.0500 time 0.5751 (0.5872) data time 0.0006 (0.0015) model time 0.5745 (0.5868) loss 6.6733 (7.4372) grad_norm 1.8323 (2.3870) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:40:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][610/625] eta 0:00:08 lr 0.000653 wd 0.0500 time 0.5718 (0.5870) data time 0.0004 (0.0015) model time 0.5714 (0.5865) loss 6.4871 (7.4361) grad_norm 3.4981 (2.3863) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:40:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [152/300][620/625] eta 0:00:02 lr 0.000653 wd 0.0500 time 0.5723 (0.5868) data time 0.0006 (0.0015) model time 0.5717 (0.5863) loss 8.0405 (7.4433) grad_norm 2.8927 (2.3912) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:40:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 152 training takes 0:06:06 +[2024-07-25 05:40:43 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 05:40:44 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 05:40:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.473 (0.473) Loss 0.5103 (0.5103) Acc@1 88.770 (88.770) Acc@5 98.389 (98.389) Mem 22339MB +[2024-07-25 05:40:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.8027 (0.6416) Acc@1 81.592 (85.973) Acc@5 96.094 (97.638) Mem 22339MB +[2024-07-25 05:40:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.9316 (0.7516) Acc@1 77.393 (82.817) Acc@5 95.459 (96.547) Mem 22339MB +[2024-07-25 05:40:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.516 Acc@5 96.547 +[2024-07-25 05:40:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.5% +[2024-07-25 05:40:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.759 (0.759) Loss 0.4934 (0.4934) Acc@1 89.600 (89.600) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-25 05:40:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.184) Loss 0.7715 (0.6236) Acc@1 81.689 (86.692) Acc@5 96.289 (97.736) Mem 22339MB +[2024-07-25 05:40:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.156) Loss 0.9014 (0.7268) Acc@1 78.174 (83.559) Acc@5 95.459 (96.726) Mem 22339MB +[2024-07-25 05:40:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.187 Acc@5 96.743 +[2024-07-25 05:40:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.2% +[2024-07-25 05:40:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][0/625] eta 0:14:44 lr 0.000653 wd 0.0500 time 1.4158 (1.4158) data time 0.4202 (0.4202) model time 0.0000 (0.0000) loss 6.1116 (6.1116) grad_norm 1.6610 (1.6610) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:40:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][10/625] eta 0:06:39 lr 0.000652 wd 0.0500 time 0.5722 (0.6501) data time 0.0006 (0.0390) model time 0.0000 (0.0000) loss 7.4650 (7.5128) grad_norm 2.2219 (2.5144) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:41:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][20/625] eta 0:06:11 lr 0.000652 wd 0.0500 time 0.5795 (0.6137) data time 0.0007 (0.0208) model time 0.0000 (0.0000) loss 8.4674 (7.5445) grad_norm 4.1307 (2.7173) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:41:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][30/625] eta 0:05:57 lr 0.000652 wd 0.0500 time 0.5754 (0.6008) data time 0.0008 (0.0144) model time 0.0000 (0.0000) loss 8.4979 (7.6563) grad_norm 2.8023 (2.6690) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:41:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][40/625] eta 0:05:47 lr 0.000652 wd 0.0500 time 0.5747 (0.5940) data time 0.0006 (0.0110) model time 0.0000 (0.0000) loss 7.0820 (7.5449) grad_norm 2.6132 (2.6824) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:41:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][50/625] eta 0:05:39 lr 0.000652 wd 0.0500 time 0.5747 (0.5900) data time 0.0006 (0.0090) model time 0.0000 (0.0000) loss 7.9508 (7.5609) grad_norm 3.3889 (2.7377) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:41:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][60/625] eta 0:05:31 lr 0.000652 wd 0.0500 time 0.5762 (0.5875) data time 0.0007 (0.0077) model time 0.5755 (0.5741) loss 8.2025 (7.5143) grad_norm 3.4951 (2.7556) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:41:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][70/625] eta 0:05:25 lr 0.000652 wd 0.0500 time 0.5749 (0.5856) data time 0.0008 (0.0067) model time 0.5741 (0.5737) loss 7.3808 (7.5640) grad_norm 1.5009 (2.6481) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:41:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][80/625] eta 0:05:18 lr 0.000652 wd 0.0500 time 0.5759 (0.5841) data time 0.0008 (0.0060) model time 0.5751 (0.5732) loss 7.6311 (7.5399) grad_norm 2.0250 (2.5740) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:41:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][90/625] eta 0:05:11 lr 0.000652 wd 0.0500 time 0.5731 (0.5829) data time 0.0009 (0.0054) model time 0.5722 (0.5729) loss 7.5854 (7.5289) grad_norm 1.9593 (2.5107) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:41:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][100/625] eta 0:05:05 lr 0.000652 wd 0.0500 time 0.5738 (0.5819) data time 0.0008 (0.0049) model time 0.5731 (0.5728) loss 9.1815 (7.5255) grad_norm 1.8531 (2.5028) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:41:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][110/625] eta 0:04:59 lr 0.000651 wd 0.0500 time 0.5764 (0.5820) data time 0.0008 (0.0046) model time 0.5756 (0.5743) loss 6.6684 (7.5267) grad_norm 1.8508 (2.4887) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:42:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][120/625] eta 0:04:54 lr 0.000651 wd 0.0500 time 0.5766 (0.5824) data time 0.0008 (0.0043) model time 0.5758 (0.5761) loss 7.4283 (7.5069) grad_norm 1.9280 (2.5213) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:42:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][130/625] eta 0:04:50 lr 0.000651 wd 0.0500 time 0.7160 (0.5866) data time 0.0008 (0.0040) model time 0.7152 (0.5837) loss 8.0789 (7.5250) grad_norm 2.2612 (2.5044) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:42:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][140/625] eta 0:04:46 lr 0.000651 wd 0.0500 time 0.5763 (0.5904) data time 0.0006 (0.0038) model time 0.5757 (0.5898) loss 6.9202 (7.5383) grad_norm 2.2568 (2.4949) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:42:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][150/625] eta 0:04:42 lr 0.000651 wd 0.0500 time 0.5739 (0.5953) data time 0.0008 (0.0036) model time 0.5731 (0.5972) loss 5.9140 (7.5508) grad_norm 2.2179 (2.4850) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:42:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][160/625] eta 0:04:36 lr 0.000651 wd 0.0500 time 0.7057 (0.5953) data time 0.0006 (0.0034) model time 0.7051 (0.5969) loss 6.8687 (7.5391) grad_norm 2.0372 (2.4726) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:42:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][170/625] eta 0:04:30 lr 0.000651 wd 0.0500 time 0.5760 (0.5950) data time 0.0007 (0.0032) model time 0.5754 (0.5962) loss 6.3479 (7.5292) grad_norm 1.9177 (2.4508) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:42:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][180/625] eta 0:04:24 lr 0.000651 wd 0.0500 time 0.5753 (0.5938) data time 0.0006 (0.0031) model time 0.5747 (0.5945) loss 8.5476 (7.5105) grad_norm 2.5193 (2.4389) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:42:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][190/625] eta 0:04:17 lr 0.000651 wd 0.0500 time 0.5719 (0.5928) data time 0.0006 (0.0030) model time 0.5713 (0.5931) loss 7.5970 (7.5048) grad_norm 2.5698 (2.4236) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:42:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][200/625] eta 0:04:11 lr 0.000650 wd 0.0500 time 0.5737 (0.5919) data time 0.0006 (0.0029) model time 0.5731 (0.5917) loss 7.3733 (7.5194) grad_norm 2.3120 (2.4421) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:42:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][210/625] eta 0:04:05 lr 0.000650 wd 0.0500 time 0.5757 (0.5911) data time 0.0008 (0.0028) model time 0.5749 (0.5906) loss 7.4368 (7.5321) grad_norm 2.8569 (2.4454) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:43:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][220/625] eta 0:03:59 lr 0.000650 wd 0.0500 time 0.5744 (0.5904) data time 0.0007 (0.0027) model time 0.5737 (0.5897) loss 8.2217 (7.5483) grad_norm 3.0180 (2.4468) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:43:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][230/625] eta 0:03:52 lr 0.000650 wd 0.0500 time 0.5735 (0.5897) data time 0.0008 (0.0026) model time 0.5727 (0.5888) loss 8.4439 (7.5364) grad_norm 2.0174 (2.4237) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:43:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][240/625] eta 0:03:46 lr 0.000650 wd 0.0500 time 0.5769 (0.5891) data time 0.0008 (0.0026) model time 0.5762 (0.5880) loss 6.9874 (7.5159) grad_norm 2.2271 (2.4354) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:43:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][250/625] eta 0:03:40 lr 0.000650 wd 0.0500 time 0.5769 (0.5886) data time 0.0006 (0.0025) model time 0.5763 (0.5874) loss 7.3146 (7.5223) grad_norm 1.8290 (2.4296) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:43:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][260/625] eta 0:03:34 lr 0.000650 wd 0.0500 time 0.5748 (0.5880) data time 0.0008 (0.0024) model time 0.5741 (0.5867) loss 7.7206 (7.5228) grad_norm 2.5328 (2.4189) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:43:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][270/625] eta 0:03:28 lr 0.000650 wd 0.0500 time 0.5745 (0.5876) data time 0.0008 (0.0024) model time 0.5738 (0.5862) loss 8.2416 (7.5356) grad_norm 2.3625 (2.4165) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:43:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][280/625] eta 0:03:22 lr 0.000650 wd 0.0500 time 0.5721 (0.5871) data time 0.0010 (0.0023) model time 0.5711 (0.5856) loss 6.4866 (7.5360) grad_norm 2.2032 (2.4126) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:43:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][290/625] eta 0:03:16 lr 0.000650 wd 0.0500 time 0.5759 (0.5868) data time 0.0006 (0.0023) model time 0.5754 (0.5853) loss 8.3012 (7.5312) grad_norm 1.8058 (2.4105) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:43:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][300/625] eta 0:03:10 lr 0.000649 wd 0.0500 time 0.5767 (0.5864) data time 0.0008 (0.0022) model time 0.5759 (0.5849) loss 8.7757 (7.5508) grad_norm 2.1079 (2.4138) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:43:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][310/625] eta 0:03:04 lr 0.000649 wd 0.0500 time 0.5741 (0.5860) data time 0.0006 (0.0022) model time 0.5735 (0.5844) loss 7.6144 (7.5552) grad_norm 1.8364 (2.4144) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:43:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][320/625] eta 0:02:58 lr 0.000649 wd 0.0500 time 0.5733 (0.5857) data time 0.0008 (0.0021) model time 0.5726 (0.5840) loss 8.1914 (7.5580) grad_norm 2.4826 (2.4166) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:44:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][330/625] eta 0:02:52 lr 0.000649 wd 0.0500 time 0.5773 (0.5855) data time 0.0008 (0.0021) model time 0.5766 (0.5839) loss 7.6633 (7.5446) grad_norm 2.1182 (2.4187) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:44:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][340/625] eta 0:02:46 lr 0.000649 wd 0.0500 time 0.7207 (0.5858) data time 0.0008 (0.0021) model time 0.7199 (0.5842) loss 8.1503 (7.5468) grad_norm 1.9751 (2.4119) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:44:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][350/625] eta 0:02:41 lr 0.000649 wd 0.0500 time 0.5730 (0.5866) data time 0.0006 (0.0020) model time 0.5724 (0.5852) loss 7.9617 (7.5619) grad_norm 2.6902 (2.4096) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:44:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][360/625] eta 0:02:35 lr 0.000649 wd 0.0500 time 0.7024 (0.5879) data time 0.0008 (0.0020) model time 0.7016 (0.5867) loss 7.7638 (7.5640) grad_norm 2.3864 (2.4054) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:44:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][370/625] eta 0:02:30 lr 0.000649 wd 0.0500 time 0.6960 (0.5895) data time 0.0008 (0.0020) model time 0.6952 (0.5886) loss 8.1822 (7.5691) grad_norm 5.1435 (2.4141) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:44:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][380/625] eta 0:02:24 lr 0.000649 wd 0.0500 time 0.6702 (0.5897) data time 0.0006 (0.0019) model time 0.6696 (0.5888) loss 7.5731 (7.5694) grad_norm 1.8333 (2.4162) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:44:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][390/625] eta 0:02:18 lr 0.000648 wd 0.0500 time 0.5764 (0.5893) data time 0.0007 (0.0019) model time 0.5757 (0.5884) loss 7.0575 (7.5664) grad_norm 1.8680 (2.4096) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:44:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][400/625] eta 0:02:12 lr 0.000648 wd 0.0500 time 0.5762 (0.5889) data time 0.0006 (0.0019) model time 0.5755 (0.5880) loss 6.2091 (7.5585) grad_norm 2.2065 (2.4082) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:44:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][410/625] eta 0:02:06 lr 0.000648 wd 0.0500 time 0.5747 (0.5886) data time 0.0008 (0.0018) model time 0.5739 (0.5876) loss 7.6960 (7.5573) grad_norm 2.0080 (2.4011) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:44:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][420/625] eta 0:02:00 lr 0.000648 wd 0.0500 time 0.5741 (0.5883) data time 0.0007 (0.0018) model time 0.5734 (0.5872) loss 8.2395 (7.5493) grad_norm 1.8514 (2.3920) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:45:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][430/625] eta 0:01:54 lr 0.000648 wd 0.0500 time 0.5759 (0.5879) data time 0.0006 (0.0018) model time 0.5753 (0.5868) loss 7.9528 (7.5448) grad_norm 3.0717 (2.3957) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:45:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][440/625] eta 0:01:48 lr 0.000648 wd 0.0500 time 0.5742 (0.5876) data time 0.0008 (0.0018) model time 0.5733 (0.5865) loss 7.0978 (7.5515) grad_norm 2.6756 (2.4114) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:45:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][450/625] eta 0:01:42 lr 0.000648 wd 0.0500 time 0.5781 (0.5873) data time 0.0006 (0.0017) model time 0.5775 (0.5862) loss 7.2680 (7.5429) grad_norm 1.9850 (2.4199) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:45:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][460/625] eta 0:01:36 lr 0.000648 wd 0.0500 time 0.5695 (0.5871) data time 0.0008 (0.0017) model time 0.5687 (0.5859) loss 8.0898 (7.5485) grad_norm 2.2360 (2.4154) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:45:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][470/625] eta 0:01:30 lr 0.000648 wd 0.0500 time 0.5750 (0.5868) data time 0.0007 (0.0017) model time 0.5743 (0.5856) loss 7.4761 (7.5415) grad_norm 1.8524 (2.4134) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:45:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][480/625] eta 0:01:25 lr 0.000648 wd 0.0500 time 0.5734 (0.5866) data time 0.0006 (0.0017) model time 0.5728 (0.5853) loss 8.2338 (7.5404) grad_norm 2.1459 (2.4047) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:45:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][490/625] eta 0:01:19 lr 0.000647 wd 0.0500 time 0.5741 (0.5863) data time 0.0006 (0.0017) model time 0.5735 (0.5851) loss 9.0615 (7.5416) grad_norm 1.7600 (2.4036) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:45:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][500/625] eta 0:01:13 lr 0.000647 wd 0.0500 time 0.5729 (0.5861) data time 0.0008 (0.0017) model time 0.5721 (0.5848) loss 6.4095 (7.5485) grad_norm 1.9410 (2.3965) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:45:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][510/625] eta 0:01:07 lr 0.000647 wd 0.0500 time 0.5705 (0.5859) data time 0.0007 (0.0016) model time 0.5698 (0.5846) loss 6.1998 (7.5420) grad_norm 2.2358 (2.3950) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:45:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][520/625] eta 0:01:01 lr 0.000647 wd 0.0500 time 0.5748 (0.5857) data time 0.0008 (0.0016) model time 0.5739 (0.5844) loss 6.3104 (7.5362) grad_norm 1.9799 (2.3870) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:46:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][530/625] eta 0:00:55 lr 0.000647 wd 0.0500 time 0.5856 (0.5855) data time 0.0006 (0.0016) model time 0.5851 (0.5842) loss 8.1005 (7.5446) grad_norm 1.7557 (2.3868) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:46:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][540/625] eta 0:00:49 lr 0.000647 wd 0.0500 time 0.5731 (0.5853) data time 0.0008 (0.0016) model time 0.5723 (0.5840) loss 6.2724 (7.5403) grad_norm 2.7496 (2.3990) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:46:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][550/625] eta 0:00:43 lr 0.000647 wd 0.0500 time 0.7561 (0.5855) data time 0.0008 (0.0016) model time 0.7552 (0.5842) loss 6.9700 (7.5379) grad_norm 2.0919 (2.4018) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:46:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][560/625] eta 0:00:38 lr 0.000647 wd 0.0500 time 0.7470 (0.5857) data time 0.0006 (0.0016) model time 0.7464 (0.5845) loss 6.1840 (7.5299) grad_norm 1.6231 (2.4026) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:46:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][570/625] eta 0:00:32 lr 0.000647 wd 0.0500 time 0.6929 (0.5863) data time 0.0009 (0.0016) model time 0.6920 (0.5851) loss 6.4614 (7.5268) grad_norm 1.8128 (2.4052) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:46:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][580/625] eta 0:00:26 lr 0.000646 wd 0.0500 time 0.7437 (0.5871) data time 0.0007 (0.0015) model time 0.7430 (0.5860) loss 8.0931 (7.5298) grad_norm 2.9683 (2.4053) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:46:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][590/625] eta 0:00:20 lr 0.000646 wd 0.0500 time 0.7350 (0.5885) data time 0.0006 (0.0015) model time 0.7343 (0.5875) loss 8.2731 (7.5284) grad_norm 2.8076 (2.4153) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:46:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][600/625] eta 0:00:14 lr 0.000646 wd 0.0500 time 0.6571 (0.5887) data time 0.0007 (0.0015) model time 0.6564 (0.5877) loss 8.2750 (7.5250) grad_norm 2.2249 (2.4204) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:46:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][610/625] eta 0:00:08 lr 0.000646 wd 0.0500 time 0.5750 (0.5884) data time 0.0006 (0.0015) model time 0.5744 (0.5875) loss 6.0271 (7.5288) grad_norm 1.6341 (2.4238) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:46:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [153/300][620/625] eta 0:00:02 lr 0.000646 wd 0.0500 time 0.5729 (0.5882) data time 0.0005 (0.0015) model time 0.5724 (0.5872) loss 9.0197 (7.5336) grad_norm 3.4353 (2.4260) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:46:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 153 training takes 0:06:07 +[2024-07-25 05:46:59 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 05:47:00 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 05:47:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.475 (0.475) Loss 0.5420 (0.5420) Acc@1 88.818 (88.818) Acc@5 98.730 (98.730) Mem 22339MB +[2024-07-25 05:47:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.157) Loss 0.8291 (0.6710) Acc@1 81.006 (85.995) Acc@5 96.045 (97.643) Mem 22339MB +[2024-07-25 05:47:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9395 (0.7789) Acc@1 77.100 (82.812) Acc@5 95.508 (96.526) Mem 22339MB +[2024-07-25 05:47:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.542 Acc@5 96.517 +[2024-07-25 05:47:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.5% +[2024-07-25 05:47:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.777 (0.777) Loss 0.4934 (0.4934) Acc@1 89.648 (89.648) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-25 05:47:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.186) Loss 0.7705 (0.6236) Acc@1 81.689 (86.714) Acc@5 96.240 (97.745) Mem 22339MB +[2024-07-25 05:47:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.157) Loss 0.8999 (0.7264) Acc@1 78.125 (83.566) Acc@5 95.508 (96.754) Mem 22339MB +[2024-07-25 05:47:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.199 Acc@5 96.761 +[2024-07-25 05:47:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.2% +[2024-07-25 05:47:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.20% +[2024-07-25 05:47:08 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 05:47:09 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 05:47:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][0/625] eta 0:09:09 lr 0.000646 wd 0.0500 time 0.8789 (0.8789) data time 0.3616 (0.3616) model time 0.0000 (0.0000) loss 6.1114 (6.1114) grad_norm 3.4590 (3.4590) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:47:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][10/625] eta 0:06:10 lr 0.000646 wd 0.0500 time 0.5726 (0.6019) data time 0.0008 (0.0335) model time 0.0000 (0.0000) loss 6.6474 (7.6274) grad_norm 2.0084 (2.4125) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:47:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][20/625] eta 0:05:58 lr 0.000646 wd 0.0500 time 0.5743 (0.5919) data time 0.0008 (0.0180) model time 0.0000 (0.0000) loss 6.1194 (7.3139) grad_norm 1.7376 (2.3044) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:47:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][30/625] eta 0:05:48 lr 0.000646 wd 0.0500 time 0.5704 (0.5860) data time 0.0008 (0.0124) model time 0.0000 (0.0000) loss 6.6981 (7.2286) grad_norm 2.0467 (2.3451) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:47:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][40/625] eta 0:05:40 lr 0.000646 wd 0.0500 time 0.5616 (0.5829) data time 0.0008 (0.0096) model time 0.0000 (0.0000) loss 8.6616 (7.2691) grad_norm 2.2912 (2.5201) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:47:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][50/625] eta 0:05:34 lr 0.000645 wd 0.0500 time 0.5727 (0.5817) data time 0.0006 (0.0079) model time 0.0000 (0.0000) loss 8.2419 (7.2881) grad_norm 2.3917 (2.4455) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:47:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][60/625] eta 0:05:28 lr 0.000645 wd 0.0500 time 0.5727 (0.5807) data time 0.0006 (0.0067) model time 0.5721 (0.5750) loss 7.3671 (7.2989) grad_norm 1.7139 (2.3846) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:47:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][70/625] eta 0:05:21 lr 0.000645 wd 0.0500 time 0.5735 (0.5799) data time 0.0006 (0.0059) model time 0.5730 (0.5745) loss 6.0997 (7.3404) grad_norm 2.6223 (2.3791) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:47:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][80/625] eta 0:05:15 lr 0.000645 wd 0.0500 time 0.5747 (0.5795) data time 0.0006 (0.0052) model time 0.5740 (0.5750) loss 6.7387 (7.3771) grad_norm 2.5319 (2.4144) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:48:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][90/625] eta 0:05:09 lr 0.000645 wd 0.0500 time 0.5630 (0.5791) data time 0.0006 (0.0048) model time 0.5624 (0.5750) loss 7.8143 (7.4108) grad_norm 2.8178 (2.4289) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:48:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][100/625] eta 0:05:04 lr 0.000645 wd 0.0500 time 0.5687 (0.5800) data time 0.0008 (0.0044) model time 0.5679 (0.5775) loss 8.6345 (7.3975) grad_norm 8.3833 (2.4928) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:48:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][110/625] eta 0:04:58 lr 0.000645 wd 0.0500 time 0.5648 (0.5798) data time 0.0008 (0.0041) model time 0.5640 (0.5773) loss 6.6292 (7.4000) grad_norm 3.5054 (2.5179) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:48:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][120/625] eta 0:04:52 lr 0.000645 wd 0.0500 time 0.5741 (0.5798) data time 0.0008 (0.0038) model time 0.5733 (0.5776) loss 9.2962 (7.4130) grad_norm 2.8055 (2.5342) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:48:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][130/625] eta 0:04:46 lr 0.000645 wd 0.0500 time 0.5708 (0.5794) data time 0.0008 (0.0036) model time 0.5700 (0.5772) loss 8.9741 (7.4210) grad_norm 2.2567 (2.5164) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:48:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][140/625] eta 0:04:40 lr 0.000644 wd 0.0500 time 0.5751 (0.5792) data time 0.0009 (0.0034) model time 0.5742 (0.5770) loss 8.8618 (7.4304) grad_norm 3.4467 (2.5366) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:48:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][150/625] eta 0:04:35 lr 0.000644 wd 0.0500 time 0.5759 (0.5794) data time 0.0008 (0.0032) model time 0.5750 (0.5774) loss 8.4234 (7.4594) grad_norm 2.2931 (2.5586) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:48:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][160/625] eta 0:04:30 lr 0.000644 wd 0.0500 time 0.5665 (0.5820) data time 0.0008 (0.0030) model time 0.5657 (0.5813) loss 8.5531 (7.4644) grad_norm 1.7692 (2.5401) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:48:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][170/625] eta 0:04:25 lr 0.000644 wd 0.0500 time 0.5702 (0.5844) data time 0.0008 (0.0029) model time 0.5694 (0.5847) loss 8.4849 (7.4493) grad_norm 2.5865 (2.5467) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:48:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][180/625] eta 0:04:21 lr 0.000644 wd 0.0500 time 0.6791 (0.5880) data time 0.0008 (0.0028) model time 0.6783 (0.5897) loss 6.4914 (7.4396) grad_norm 3.6456 (2.5459) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:49:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][190/625] eta 0:04:16 lr 0.000644 wd 0.0500 time 0.5703 (0.5903) data time 0.0006 (0.0027) model time 0.5696 (0.5927) loss 7.4299 (7.4432) grad_norm 1.9459 (2.5504) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:49:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][200/625] eta 0:04:10 lr 0.000644 wd 0.0500 time 0.5740 (0.5901) data time 0.0009 (0.0026) model time 0.5732 (0.5922) loss 7.6706 (7.4550) grad_norm 2.1667 (2.5647) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:49:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][210/625] eta 0:04:04 lr 0.000644 wd 0.0500 time 0.5751 (0.5894) data time 0.0006 (0.0025) model time 0.5745 (0.5911) loss 6.6525 (7.4447) grad_norm 2.6829 (2.5521) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:49:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][220/625] eta 0:03:58 lr 0.000644 wd 0.0500 time 0.5734 (0.5887) data time 0.0006 (0.0024) model time 0.5728 (0.5900) loss 8.0280 (7.4543) grad_norm 2.2832 (2.5270) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:49:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][230/625] eta 0:03:52 lr 0.000644 wd 0.0500 time 0.5744 (0.5882) data time 0.0007 (0.0024) model time 0.5736 (0.5892) loss 6.8883 (7.4470) grad_norm 3.4860 (2.5251) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:49:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][240/625] eta 0:03:46 lr 0.000643 wd 0.0500 time 0.5727 (0.5876) data time 0.0008 (0.0023) model time 0.5719 (0.5884) loss 7.6663 (7.4358) grad_norm 2.8879 (2.5272) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:49:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][250/625] eta 0:03:40 lr 0.000643 wd 0.0500 time 0.5697 (0.5871) data time 0.0006 (0.0022) model time 0.5691 (0.5877) loss 7.3844 (7.4348) grad_norm 2.7639 (2.5238) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:49:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][260/625] eta 0:03:34 lr 0.000643 wd 0.0500 time 0.5722 (0.5866) data time 0.0006 (0.0022) model time 0.5716 (0.5870) loss 7.5644 (7.4403) grad_norm 2.3490 (2.5289) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:49:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][270/625] eta 0:03:28 lr 0.000643 wd 0.0500 time 0.5720 (0.5862) data time 0.0010 (0.0021) model time 0.5711 (0.5864) loss 6.5123 (7.4526) grad_norm 2.0913 (2.5212) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:49:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][280/625] eta 0:03:22 lr 0.000643 wd 0.0500 time 0.5679 (0.5858) data time 0.0008 (0.0021) model time 0.5670 (0.5859) loss 8.4983 (7.4593) grad_norm 2.1948 (2.5150) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:49:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][290/625] eta 0:03:16 lr 0.000643 wd 0.0500 time 0.5742 (0.5855) data time 0.0007 (0.0020) model time 0.5735 (0.5855) loss 7.6723 (7.4761) grad_norm 1.9834 (2.5030) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:50:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][300/625] eta 0:03:10 lr 0.000643 wd 0.0500 time 0.5625 (0.5852) data time 0.0005 (0.0020) model time 0.5620 (0.5851) loss 8.2600 (7.4896) grad_norm 3.5220 (2.4979) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:50:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][310/625] eta 0:03:04 lr 0.000643 wd 0.0500 time 0.5719 (0.5848) data time 0.0008 (0.0020) model time 0.5711 (0.5847) loss 8.7058 (7.4968) grad_norm 5.0584 (2.5000) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:50:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][320/625] eta 0:02:58 lr 0.000643 wd 0.0500 time 0.5720 (0.5846) data time 0.0008 (0.0019) model time 0.5712 (0.5844) loss 6.5498 (7.4870) grad_norm 1.7311 (2.5019) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:50:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][330/625] eta 0:02:52 lr 0.000642 wd 0.0500 time 0.5739 (0.5843) data time 0.0008 (0.0019) model time 0.5731 (0.5840) loss 7.4274 (7.4876) grad_norm 2.1850 (2.5039) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 05:50:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][340/625] eta 0:02:46 lr 0.000642 wd 0.0500 time 0.5736 (0.5841) data time 0.0007 (0.0018) model time 0.5729 (0.5837) loss 7.9574 (7.4736) grad_norm 2.1786 (2.4971) loss_scale 1024.0000 (516.5044) mem 22339MB +[2024-07-25 05:50:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][350/625] eta 0:02:40 lr 0.000642 wd 0.0500 time 0.5705 (0.5839) data time 0.0006 (0.0018) model time 0.5699 (0.5834) loss 8.4586 (7.4746) grad_norm 2.3557 (2.4930) loss_scale 1024.0000 (530.9630) mem 22339MB +[2024-07-25 05:50:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][360/625] eta 0:02:34 lr 0.000642 wd 0.0500 time 0.5685 (0.5836) data time 0.0009 (0.0018) model time 0.5676 (0.5832) loss 7.4606 (7.4678) grad_norm 2.7477 (2.4835) loss_scale 1024.0000 (544.6205) mem 22339MB +[2024-07-25 05:50:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][370/625] eta 0:02:28 lr 0.000642 wd 0.0500 time 0.5699 (0.5836) data time 0.0006 (0.0018) model time 0.5693 (0.5831) loss 7.4166 (7.4628) grad_norm 2.3951 (2.4968) loss_scale 1024.0000 (557.5418) mem 22339MB +[2024-07-25 05:50:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][380/625] eta 0:02:23 lr 0.000642 wd 0.0500 time 0.6425 (0.5841) data time 0.0007 (0.0018) model time 0.6418 (0.5837) loss 8.5936 (7.4695) grad_norm 2.1252 (2.4963) loss_scale 1024.0000 (569.7848) mem 22339MB +[2024-07-25 05:50:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][390/625] eta 0:02:17 lr 0.000642 wd 0.0500 time 0.6419 (0.5845) data time 0.0006 (0.0017) model time 0.6413 (0.5841) loss 6.6682 (7.4775) grad_norm 2.3197 (2.4870) loss_scale 1024.0000 (581.4015) mem 22339MB +[2024-07-25 05:51:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][400/625] eta 0:02:11 lr 0.000642 wd 0.0500 time 0.7471 (0.5863) data time 0.0006 (0.0017) model time 0.7465 (0.5861) loss 7.0699 (7.4751) grad_norm 1.8871 (2.4778) loss_scale 1024.0000 (592.4389) mem 22339MB +[2024-07-25 05:51:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][410/625] eta 0:02:06 lr 0.000642 wd 0.0500 time 0.5678 (0.5873) data time 0.0008 (0.0017) model time 0.5670 (0.5873) loss 8.7335 (7.4741) grad_norm 4.8652 (2.4754) loss_scale 1024.0000 (602.9392) mem 22339MB +[2024-07-25 05:51:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][420/625] eta 0:02:00 lr 0.000641 wd 0.0500 time 0.5725 (0.5877) data time 0.0006 (0.0017) model time 0.5719 (0.5877) loss 8.4564 (7.4770) grad_norm 2.5058 (2.4773) loss_scale 1024.0000 (612.9406) mem 22339MB +[2024-07-25 05:51:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][430/625] eta 0:01:54 lr 0.000641 wd 0.0500 time 0.5722 (0.5874) data time 0.0006 (0.0016) model time 0.5716 (0.5874) loss 7.7424 (7.4750) grad_norm 1.8178 (2.4728) loss_scale 1024.0000 (622.4780) mem 22339MB +[2024-07-25 05:51:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][440/625] eta 0:01:48 lr 0.000641 wd 0.0500 time 0.5733 (0.5872) data time 0.0008 (0.0016) model time 0.5724 (0.5871) loss 7.4667 (7.4741) grad_norm 1.9626 (2.4671) loss_scale 1024.0000 (631.5828) mem 22339MB +[2024-07-25 05:51:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][450/625] eta 0:01:42 lr 0.000641 wd 0.0500 time 0.5658 (0.5869) data time 0.0007 (0.0016) model time 0.5650 (0.5868) loss 8.3716 (7.4814) grad_norm 1.8537 (2.4662) loss_scale 1024.0000 (640.2838) mem 22339MB +[2024-07-25 05:51:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][460/625] eta 0:01:36 lr 0.000641 wd 0.0500 time 0.5733 (0.5867) data time 0.0008 (0.0016) model time 0.5725 (0.5865) loss 8.4203 (7.4846) grad_norm 1.6668 (2.4648) loss_scale 1024.0000 (648.6074) mem 22339MB +[2024-07-25 05:51:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][470/625] eta 0:01:30 lr 0.000641 wd 0.0500 time 0.5741 (0.5864) data time 0.0008 (0.0016) model time 0.5733 (0.5862) loss 8.4288 (7.4896) grad_norm 1.7719 (2.4552) loss_scale 1024.0000 (656.5775) mem 22339MB +[2024-07-25 05:51:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][480/625] eta 0:01:24 lr 0.000641 wd 0.0500 time 0.5712 (0.5861) data time 0.0008 (0.0016) model time 0.5704 (0.5858) loss 8.3597 (7.4920) grad_norm 2.1236 (2.4534) loss_scale 1024.0000 (664.2162) mem 22339MB +[2024-07-25 05:51:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][490/625] eta 0:01:19 lr 0.000641 wd 0.0500 time 0.5713 (0.5859) data time 0.0006 (0.0015) model time 0.5707 (0.5855) loss 6.1224 (7.4944) grad_norm 2.5735 (2.4524) loss_scale 1024.0000 (671.5438) mem 22339MB +[2024-07-25 05:52:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][500/625] eta 0:01:13 lr 0.000641 wd 0.0500 time 0.5716 (0.5856) data time 0.0006 (0.0015) model time 0.5710 (0.5853) loss 7.5162 (7.4933) grad_norm 3.0336 (2.4719) loss_scale 1024.0000 (678.5788) mem 22339MB +[2024-07-25 05:52:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][510/625] eta 0:01:07 lr 0.000641 wd 0.0500 time 0.5702 (0.5854) data time 0.0008 (0.0015) model time 0.5694 (0.5850) loss 7.2130 (7.4991) grad_norm 2.7788 (2.4736) loss_scale 1024.0000 (685.3386) mem 22339MB +[2024-07-25 05:52:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][520/625] eta 0:01:01 lr 0.000640 wd 0.0500 time 0.5727 (0.5852) data time 0.0006 (0.0015) model time 0.5722 (0.5848) loss 8.5657 (7.5023) grad_norm 2.3186 (2.4723) loss_scale 1024.0000 (691.8388) mem 22339MB +[2024-07-25 05:52:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][530/625] eta 0:00:55 lr 0.000640 wd 0.0500 time 0.5715 (0.5850) data time 0.0008 (0.0015) model time 0.5707 (0.5845) loss 7.4712 (7.5028) grad_norm 3.4279 (2.4863) loss_scale 1024.0000 (698.0942) mem 22339MB +[2024-07-25 05:52:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][540/625] eta 0:00:49 lr 0.000640 wd 0.0500 time 0.5624 (0.5849) data time 0.0006 (0.0015) model time 0.5618 (0.5844) loss 7.8010 (7.4939) grad_norm 3.9283 (2.4956) loss_scale 1024.0000 (704.1183) mem 22339MB +[2024-07-25 05:52:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][550/625] eta 0:00:43 lr 0.000640 wd 0.0500 time 0.5737 (0.5847) data time 0.0006 (0.0015) model time 0.5731 (0.5842) loss 8.0475 (7.4889) grad_norm 2.3472 (2.4902) loss_scale 1024.0000 (709.9238) mem 22339MB +[2024-07-25 05:52:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][560/625] eta 0:00:37 lr 0.000640 wd 0.0500 time 0.5720 (0.5845) data time 0.0006 (0.0014) model time 0.5714 (0.5840) loss 6.5754 (7.4762) grad_norm 2.3346 (2.4863) loss_scale 1024.0000 (715.5223) mem 22339MB +[2024-07-25 05:52:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][570/625] eta 0:00:32 lr 0.000640 wd 0.0500 time 0.5698 (0.5843) data time 0.0006 (0.0014) model time 0.5692 (0.5838) loss 7.9066 (7.4838) grad_norm 1.7472 (2.4801) loss_scale 1024.0000 (720.9247) mem 22339MB +[2024-07-25 05:52:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][580/625] eta 0:00:26 lr 0.000640 wd 0.0500 time 0.5732 (0.5842) data time 0.0008 (0.0014) model time 0.5724 (0.5836) loss 7.7455 (7.4852) grad_norm 3.9529 (2.4826) loss_scale 1024.0000 (726.1411) mem 22339MB +[2024-07-25 05:52:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][590/625] eta 0:00:20 lr 0.000640 wd 0.0500 time 0.5717 (0.5843) data time 0.0008 (0.0014) model time 0.5709 (0.5837) loss 7.7492 (7.4807) grad_norm 2.9131 (2.4851) loss_scale 1024.0000 (731.1810) mem 22339MB +[2024-07-25 05:53:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][600/625] eta 0:00:14 lr 0.000640 wd 0.0500 time 0.7023 (0.5846) data time 0.0008 (0.0014) model time 0.7015 (0.5841) loss 7.8712 (7.4797) grad_norm 2.5946 (2.4821) loss_scale 1024.0000 (736.0532) mem 22339MB +[2024-07-25 05:53:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][610/625] eta 0:00:08 lr 0.000639 wd 0.0500 time 0.7086 (0.5852) data time 0.0006 (0.0014) model time 0.7080 (0.5847) loss 8.2857 (7.4745) grad_norm 1.9420 (2.4746) loss_scale 1024.0000 (740.7660) mem 22339MB +[2024-07-25 05:53:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [154/300][620/625] eta 0:00:02 lr 0.000639 wd 0.0500 time 0.7368 (0.5862) data time 0.0006 (0.0014) model time 0.7362 (0.5858) loss 7.2789 (7.4747) grad_norm 2.0140 (2.4666) loss_scale 1024.0000 (745.3269) mem 22339MB +[2024-07-25 05:53:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 154 training takes 0:06:06 +[2024-07-25 05:53:16 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 05:53:17 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 05:53:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.478 (0.478) Loss 0.5088 (0.5088) Acc@1 89.160 (89.160) Acc@5 98.682 (98.682) Mem 22339MB +[2024-07-25 05:53:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.8247 (0.6520) Acc@1 80.957 (86.048) Acc@5 95.850 (97.576) Mem 22339MB +[2024-07-25 05:53:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9360 (0.7581) Acc@1 76.221 (82.866) Acc@5 95.166 (96.524) Mem 22339MB +[2024-07-25 05:53:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.484 Acc@5 96.499 +[2024-07-25 05:53:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.5% +[2024-07-25 05:53:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.800 (0.800) Loss 0.4932 (0.4932) Acc@1 89.746 (89.746) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-25 05:53:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.187) Loss 0.7700 (0.6232) Acc@1 81.738 (86.745) Acc@5 96.289 (97.749) Mem 22339MB +[2024-07-25 05:53:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.158) Loss 0.8994 (0.7259) Acc@1 78.027 (83.589) Acc@5 95.508 (96.761) Mem 22339MB +[2024-07-25 05:53:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.211 Acc@5 96.767 +[2024-07-25 05:53:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.2% +[2024-07-25 05:53:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.21% +[2024-07-25 05:53:24 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 05:53:26 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 05:53:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][0/625] eta 0:09:43 lr 0.000639 wd 0.0500 time 0.9333 (0.9333) data time 0.4152 (0.4152) model time 0.0000 (0.0000) loss 6.7400 (6.7400) grad_norm 3.6274 (3.6274) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:53:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][10/625] eta 0:06:37 lr 0.000639 wd 0.0500 time 0.7105 (0.6468) data time 0.0006 (0.0385) model time 0.0000 (0.0000) loss 7.3204 (6.8405) grad_norm 3.4575 (2.5726) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:53:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][20/625] eta 0:06:10 lr 0.000639 wd 0.0500 time 0.5736 (0.6132) data time 0.0005 (0.0207) model time 0.0000 (0.0000) loss 5.8663 (7.0969) grad_norm 2.0887 (2.4961) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:53:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][30/625] eta 0:05:57 lr 0.000639 wd 0.0500 time 0.5726 (0.6007) data time 0.0009 (0.0143) model time 0.0000 (0.0000) loss 7.8221 (7.2691) grad_norm 3.1410 (2.3664) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:53:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][40/625] eta 0:05:47 lr 0.000639 wd 0.0500 time 0.5732 (0.5941) data time 0.0007 (0.0110) model time 0.0000 (0.0000) loss 7.7899 (7.3131) grad_norm 2.0041 (2.2759) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:53:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][50/625] eta 0:05:39 lr 0.000639 wd 0.0500 time 0.5710 (0.5906) data time 0.0006 (0.0090) model time 0.0000 (0.0000) loss 7.5900 (7.3914) grad_norm 2.6034 (2.3923) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:54:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][60/625] eta 0:05:32 lr 0.000639 wd 0.0500 time 0.5687 (0.5879) data time 0.0008 (0.0076) model time 0.5679 (0.5734) loss 6.1639 (7.3810) grad_norm 2.8090 (2.3795) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:54:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][70/625] eta 0:05:25 lr 0.000639 wd 0.0500 time 0.5713 (0.5859) data time 0.0006 (0.0067) model time 0.5707 (0.5733) loss 6.5847 (7.3275) grad_norm 3.3647 (2.4013) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:54:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][80/625] eta 0:05:19 lr 0.000638 wd 0.0500 time 0.5663 (0.5863) data time 0.0007 (0.0059) model time 0.5657 (0.5784) loss 7.1771 (7.3439) grad_norm 3.3436 (2.4043) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:54:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][90/625] eta 0:05:13 lr 0.000638 wd 0.0500 time 0.5727 (0.5851) data time 0.0009 (0.0054) model time 0.5718 (0.5773) loss 7.8497 (7.3604) grad_norm 2.8533 (2.3910) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:54:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][100/625] eta 0:05:06 lr 0.000638 wd 0.0500 time 0.5744 (0.5841) data time 0.0007 (0.0049) model time 0.5737 (0.5766) loss 8.3016 (7.4262) grad_norm 2.7627 (2.3934) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:54:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][110/625] eta 0:05:00 lr 0.000638 wd 0.0500 time 0.5749 (0.5834) data time 0.0007 (0.0046) model time 0.5742 (0.5766) loss 9.5199 (7.4038) grad_norm 2.7524 (2.4207) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:54:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][120/625] eta 0:04:54 lr 0.000638 wd 0.0500 time 0.5665 (0.5830) data time 0.0006 (0.0044) model time 0.5659 (0.5764) loss 6.4662 (7.3858) grad_norm 2.0182 (2.4065) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:54:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][130/625] eta 0:04:48 lr 0.000638 wd 0.0500 time 0.5741 (0.5824) data time 0.0006 (0.0041) model time 0.5735 (0.5761) loss 6.5848 (7.3907) grad_norm 2.7034 (2.4490) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:54:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][140/625] eta 0:04:42 lr 0.000638 wd 0.0500 time 0.5715 (0.5824) data time 0.0008 (0.0039) model time 0.5707 (0.5767) loss 7.3468 (7.4161) grad_norm 2.1144 (2.4536) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:54:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][150/625] eta 0:04:36 lr 0.000638 wd 0.0500 time 0.5731 (0.5821) data time 0.0008 (0.0039) model time 0.5722 (0.5765) loss 7.2213 (7.3864) grad_norm 5.1900 (2.4681) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:54:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][160/625] eta 0:04:30 lr 0.000638 wd 0.0500 time 0.5723 (0.5820) data time 0.0008 (0.0037) model time 0.5716 (0.5769) loss 8.1568 (7.3697) grad_norm 3.2015 (2.4794) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:55:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][170/625] eta 0:04:24 lr 0.000637 wd 0.0500 time 0.5727 (0.5817) data time 0.0006 (0.0035) model time 0.5721 (0.5767) loss 7.3874 (7.3632) grad_norm 1.7537 (2.4771) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:55:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][180/625] eta 0:04:18 lr 0.000637 wd 0.0500 time 0.5725 (0.5813) data time 0.0007 (0.0033) model time 0.5718 (0.5765) loss 8.4129 (7.3703) grad_norm 2.3560 (2.4607) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:55:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][190/625] eta 0:04:13 lr 0.000637 wd 0.0500 time 0.5714 (0.5818) data time 0.0008 (0.0032) model time 0.5705 (0.5775) loss 8.4598 (7.3943) grad_norm 2.1716 (2.4427) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:55:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][200/625] eta 0:04:07 lr 0.000637 wd 0.0500 time 0.5731 (0.5834) data time 0.0008 (0.0031) model time 0.5724 (0.5799) loss 5.9215 (7.3923) grad_norm 1.9138 (2.4178) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:55:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][210/625] eta 0:04:03 lr 0.000637 wd 0.0500 time 0.7394 (0.5860) data time 0.0006 (0.0030) model time 0.7388 (0.5835) loss 6.4359 (7.3828) grad_norm 3.3465 (2.4343) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:55:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][220/625] eta 0:03:57 lr 0.000637 wd 0.0500 time 0.7324 (0.5872) data time 0.0006 (0.0029) model time 0.7318 (0.5852) loss 6.4016 (7.3682) grad_norm 1.7160 (2.4436) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:55:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][230/625] eta 0:03:52 lr 0.000637 wd 0.0500 time 0.5707 (0.5874) data time 0.0008 (0.0028) model time 0.5699 (0.5854) loss 7.6023 (7.3699) grad_norm 2.8663 (2.4408) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:55:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][240/625] eta 0:03:45 lr 0.000637 wd 0.0500 time 0.5715 (0.5870) data time 0.0008 (0.0027) model time 0.5707 (0.5850) loss 6.5762 (7.3962) grad_norm 3.5174 (2.4471) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:55:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][250/625] eta 0:03:39 lr 0.000637 wd 0.0500 time 0.5732 (0.5865) data time 0.0007 (0.0026) model time 0.5724 (0.5844) loss 7.6205 (7.4039) grad_norm 3.7517 (2.4520) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:55:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][260/625] eta 0:03:33 lr 0.000637 wd 0.0500 time 0.5698 (0.5860) data time 0.0008 (0.0026) model time 0.5690 (0.5839) loss 7.9649 (7.3933) grad_norm 9.8618 (2.4763) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:56:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][270/625] eta 0:03:27 lr 0.000636 wd 0.0500 time 0.5722 (0.5855) data time 0.0008 (0.0025) model time 0.5714 (0.5833) loss 6.9140 (7.3862) grad_norm 1.9461 (2.4760) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:56:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][280/625] eta 0:03:21 lr 0.000636 wd 0.0500 time 0.5732 (0.5851) data time 0.0006 (0.0024) model time 0.5726 (0.5830) loss 7.0695 (7.3923) grad_norm 2.7962 (2.4762) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:56:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][290/625] eta 0:03:15 lr 0.000636 wd 0.0500 time 0.5756 (0.5847) data time 0.0008 (0.0024) model time 0.5748 (0.5825) loss 6.9807 (7.3948) grad_norm 2.8332 (2.4721) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:56:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][300/625] eta 0:03:10 lr 0.000636 wd 0.0500 time 0.5703 (0.5849) data time 0.0007 (0.0023) model time 0.5696 (0.5827) loss 6.1543 (7.3877) grad_norm 2.1332 (2.4732) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:56:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][310/625] eta 0:03:04 lr 0.000636 wd 0.0500 time 0.5696 (0.5846) data time 0.0007 (0.0023) model time 0.5689 (0.5824) loss 6.5112 (7.3852) grad_norm 2.5968 (2.4797) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:56:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][320/625] eta 0:02:58 lr 0.000636 wd 0.0500 time 0.5714 (0.5842) data time 0.0006 (0.0022) model time 0.5708 (0.5821) loss 7.1814 (7.3855) grad_norm 2.0433 (2.4828) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:56:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][330/625] eta 0:02:52 lr 0.000636 wd 0.0500 time 0.5733 (0.5840) data time 0.0008 (0.0022) model time 0.5725 (0.5819) loss 7.4539 (7.3802) grad_norm 3.4699 (2.4703) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:56:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][340/625] eta 0:02:46 lr 0.000636 wd 0.0500 time 0.5745 (0.5838) data time 0.0006 (0.0021) model time 0.5739 (0.5817) loss 7.9491 (7.3798) grad_norm 3.0221 (2.4722) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:56:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][350/625] eta 0:02:40 lr 0.000636 wd 0.0500 time 0.5730 (0.5835) data time 0.0008 (0.0021) model time 0.5722 (0.5814) loss 7.2589 (7.3866) grad_norm 1.5708 (2.4673) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:56:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][360/625] eta 0:02:34 lr 0.000635 wd 0.0500 time 0.5733 (0.5833) data time 0.0009 (0.0021) model time 0.5725 (0.5811) loss 6.6547 (7.3900) grad_norm 1.8693 (2.4620) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:57:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][370/625] eta 0:02:28 lr 0.000635 wd 0.0500 time 0.5699 (0.5830) data time 0.0008 (0.0020) model time 0.5691 (0.5809) loss 6.7213 (7.3913) grad_norm 2.6278 (2.4582) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:57:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][380/625] eta 0:02:22 lr 0.000635 wd 0.0500 time 0.5714 (0.5828) data time 0.0009 (0.0020) model time 0.5705 (0.5807) loss 7.6612 (7.3852) grad_norm 2.2398 (2.4622) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:57:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][390/625] eta 0:02:16 lr 0.000635 wd 0.0500 time 0.5668 (0.5826) data time 0.0007 (0.0020) model time 0.5662 (0.5805) loss 7.3049 (7.3894) grad_norm 2.1297 (2.4572) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:57:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][400/625] eta 0:02:11 lr 0.000635 wd 0.0500 time 0.5715 (0.5824) data time 0.0006 (0.0019) model time 0.5709 (0.5803) loss 6.3234 (7.3904) grad_norm 2.0526 (2.4568) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:57:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][410/625] eta 0:02:05 lr 0.000635 wd 0.0500 time 0.5715 (0.5827) data time 0.0006 (0.0019) model time 0.5709 (0.5806) loss 7.5843 (7.3963) grad_norm 2.5663 (2.4592) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:57:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][420/625] eta 0:01:59 lr 0.000635 wd 0.0500 time 0.5726 (0.5833) data time 0.0008 (0.0019) model time 0.5718 (0.5814) loss 7.1183 (7.4022) grad_norm 2.8380 (2.4613) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:57:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][430/625] eta 0:01:54 lr 0.000635 wd 0.0500 time 0.7022 (0.5852) data time 0.0006 (0.0019) model time 0.7016 (0.5836) loss 7.7218 (7.4009) grad_norm 2.1409 (2.4579) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:57:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][440/625] eta 0:01:48 lr 0.000635 wd 0.0500 time 0.7075 (0.5863) data time 0.0006 (0.0018) model time 0.7069 (0.5849) loss 7.2049 (7.4035) grad_norm 2.1401 (2.4492) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:57:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][450/625] eta 0:01:42 lr 0.000635 wd 0.0500 time 0.5621 (0.5867) data time 0.0006 (0.0018) model time 0.5615 (0.5853) loss 6.6753 (7.4131) grad_norm 2.7297 (2.4503) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:57:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][460/625] eta 0:01:36 lr 0.000634 wd 0.0500 time 0.5735 (0.5867) data time 0.0006 (0.0018) model time 0.5729 (0.5853) loss 6.0934 (7.4199) grad_norm 1.9090 (2.4472) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:58:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][470/625] eta 0:01:30 lr 0.000634 wd 0.0500 time 0.5825 (0.5864) data time 0.0006 (0.0018) model time 0.5819 (0.5850) loss 7.6894 (7.4210) grad_norm 1.9727 (2.4442) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:58:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][480/625] eta 0:01:24 lr 0.000634 wd 0.0500 time 0.5641 (0.5862) data time 0.0006 (0.0017) model time 0.5635 (0.5848) loss 6.4019 (7.4269) grad_norm 1.6793 (2.4390) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:58:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][490/625] eta 0:01:19 lr 0.000634 wd 0.0500 time 0.5726 (0.5860) data time 0.0006 (0.0017) model time 0.5720 (0.5846) loss 7.8724 (7.4403) grad_norm 3.5138 (2.4331) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:58:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][500/625] eta 0:01:13 lr 0.000634 wd 0.0500 time 0.5775 (0.5858) data time 0.0006 (0.0017) model time 0.5769 (0.5844) loss 7.9257 (7.4454) grad_norm 2.1779 (2.4317) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:58:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][510/625] eta 0:01:07 lr 0.000634 wd 0.0500 time 0.5766 (0.5856) data time 0.0008 (0.0017) model time 0.5757 (0.5841) loss 8.1666 (7.4495) grad_norm 3.9416 (2.4346) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:58:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][520/625] eta 0:01:01 lr 0.000634 wd 0.0500 time 0.5714 (0.5857) data time 0.0008 (0.0017) model time 0.5706 (0.5842) loss 9.2127 (7.4664) grad_norm 2.1764 (2.4280) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:58:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][530/625] eta 0:00:55 lr 0.000634 wd 0.0500 time 0.5730 (0.5855) data time 0.0008 (0.0017) model time 0.5722 (0.5841) loss 7.4983 (7.4626) grad_norm 3.3082 (2.4300) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:58:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][540/625] eta 0:00:49 lr 0.000634 wd 0.0500 time 0.5754 (0.5854) data time 0.0006 (0.0016) model time 0.5749 (0.5839) loss 6.1876 (7.4572) grad_norm 2.5881 (2.4283) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:58:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][550/625] eta 0:00:43 lr 0.000633 wd 0.0500 time 0.5774 (0.5852) data time 0.0008 (0.0016) model time 0.5766 (0.5838) loss 7.3241 (7.4623) grad_norm 2.8427 (2.4279) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:58:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][560/625] eta 0:00:38 lr 0.000633 wd 0.0500 time 0.5713 (0.5850) data time 0.0009 (0.0016) model time 0.5703 (0.5836) loss 8.0383 (7.4559) grad_norm 3.3983 (2.4290) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:59:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][570/625] eta 0:00:32 lr 0.000633 wd 0.0500 time 0.5745 (0.5848) data time 0.0008 (0.0016) model time 0.5738 (0.5834) loss 7.6901 (7.4565) grad_norm 3.1271 (2.4305) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:59:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][580/625] eta 0:00:26 lr 0.000633 wd 0.0500 time 0.5718 (0.5847) data time 0.0007 (0.0016) model time 0.5710 (0.5832) loss 7.2328 (7.4519) grad_norm 3.7760 (2.4409) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:59:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][590/625] eta 0:00:20 lr 0.000633 wd 0.0500 time 0.5719 (0.5845) data time 0.0008 (0.0016) model time 0.5711 (0.5831) loss 6.4671 (7.4517) grad_norm 2.8008 (2.4411) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:59:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][600/625] eta 0:00:14 lr 0.000633 wd 0.0500 time 0.5737 (0.5843) data time 0.0006 (0.0016) model time 0.5731 (0.5829) loss 8.0554 (7.4458) grad_norm 2.4681 (2.4406) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:59:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][610/625] eta 0:00:08 lr 0.000633 wd 0.0500 time 0.5714 (0.5842) data time 0.0006 (0.0016) model time 0.5708 (0.5827) loss 7.4959 (7.4430) grad_norm 1.8828 (2.4394) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:59:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [155/300][620/625] eta 0:00:02 lr 0.000633 wd 0.0500 time 0.5714 (0.5840) data time 0.0005 (0.0015) model time 0.5708 (0.5826) loss 6.9796 (7.4402) grad_norm 2.7658 (2.4322) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:59:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 155 training takes 0:06:05 +[2024-07-25 05:59:31 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 05:59:32 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 05:59:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 1.172 (1.172) Loss 0.4766 (0.4766) Acc@1 90.088 (90.088) Acc@5 98.828 (98.828) Mem 22339MB +[2024-07-25 05:59:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.221) Loss 0.8374 (0.6414) Acc@1 80.859 (86.248) Acc@5 95.996 (97.656) Mem 22339MB +[2024-07-25 05:59:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.175) Loss 0.9180 (0.7542) Acc@1 77.930 (83.126) Acc@5 95.117 (96.489) Mem 22339MB +[2024-07-25 05:59:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.746 Acc@5 96.465 +[2024-07-25 05:59:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.7% +[2024-07-25 05:59:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 82.75% +[2024-07-25 05:59:36 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 05:59:38 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 05:59:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.467 (0.467) Loss 0.4927 (0.4927) Acc@1 89.795 (89.795) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 05:59:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.156) Loss 0.7695 (0.6232) Acc@1 81.592 (86.714) Acc@5 96.338 (97.776) Mem 22339MB +[2024-07-25 05:59:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.141) Loss 0.8989 (0.7256) Acc@1 77.979 (83.573) Acc@5 95.605 (96.798) Mem 22339MB +[2024-07-25 05:59:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.195 Acc@5 96.805 +[2024-07-25 05:59:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.2% +[2024-07-25 05:59:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][0/625] eta 0:15:01 lr 0.000633 wd 0.0500 time 1.4429 (1.4429) data time 0.4300 (0.4300) model time 0.0000 (0.0000) loss 8.2863 (8.2863) grad_norm 1.5554 (1.5554) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:59:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][10/625] eta 0:06:59 lr 0.000633 wd 0.0500 time 0.7252 (0.6823) data time 0.0008 (0.0398) model time 0.0000 (0.0000) loss 7.8052 (7.5752) grad_norm 2.7857 (2.1780) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 05:59:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][20/625] eta 0:06:38 lr 0.000632 wd 0.0500 time 0.7284 (0.6580) data time 0.0008 (0.0212) model time 0.0000 (0.0000) loss 7.7470 (7.7161) grad_norm 3.0584 (2.2409) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:00:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][30/625] eta 0:06:32 lr 0.000632 wd 0.0500 time 0.7006 (0.6598) data time 0.0008 (0.0146) model time 0.0000 (0.0000) loss 7.2168 (7.7082) grad_norm 2.2475 (2.4051) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:00:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][40/625] eta 0:06:25 lr 0.000632 wd 0.0500 time 0.7338 (0.6583) data time 0.0008 (0.0112) model time 0.0000 (0.0000) loss 8.9922 (7.6267) grad_norm 2.4550 (2.5037) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:00:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][50/625] eta 0:06:12 lr 0.000632 wd 0.0500 time 0.5754 (0.6483) data time 0.0008 (0.0092) model time 0.0000 (0.0000) loss 7.3720 (7.6832) grad_norm 2.2208 (2.5122) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:00:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][60/625] eta 0:05:59 lr 0.000632 wd 0.0500 time 0.5732 (0.6363) data time 0.0006 (0.0078) model time 0.5726 (0.5743) loss 8.9821 (7.6227) grad_norm 2.0076 (2.4847) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:00:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][70/625] eta 0:05:48 lr 0.000632 wd 0.0500 time 0.5753 (0.6276) data time 0.0008 (0.0068) model time 0.5745 (0.5742) loss 6.8594 (7.5870) grad_norm 2.9594 (2.4724) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:00:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][80/625] eta 0:05:38 lr 0.000632 wd 0.0500 time 0.5875 (0.6211) data time 0.0008 (0.0061) model time 0.5868 (0.5741) loss 7.3211 (7.5853) grad_norm 1.6458 (2.4079) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:00:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][90/625] eta 0:05:29 lr 0.000632 wd 0.0500 time 0.5756 (0.6160) data time 0.0008 (0.0055) model time 0.5748 (0.5741) loss 6.1855 (7.5394) grad_norm 3.3398 (2.3689) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:00:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][100/625] eta 0:05:21 lr 0.000632 wd 0.0500 time 0.5843 (0.6119) data time 0.0006 (0.0051) model time 0.5837 (0.5740) loss 7.4540 (7.5217) grad_norm 2.6498 (2.3487) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:00:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][110/625] eta 0:05:13 lr 0.000631 wd 0.0500 time 0.5751 (0.6086) data time 0.0008 (0.0047) model time 0.5743 (0.5740) loss 7.5579 (7.4895) grad_norm 2.3920 (2.3500) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:00:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][120/625] eta 0:05:05 lr 0.000631 wd 0.0500 time 0.5746 (0.6058) data time 0.0007 (0.0044) model time 0.5739 (0.5741) loss 5.9873 (7.4908) grad_norm 2.1459 (2.3342) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:01:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][130/625] eta 0:04:58 lr 0.000631 wd 0.0500 time 0.5764 (0.6035) data time 0.0006 (0.0041) model time 0.5758 (0.5741) loss 6.9624 (7.5077) grad_norm 2.2561 (2.3199) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:01:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][140/625] eta 0:04:51 lr 0.000631 wd 0.0500 time 0.5743 (0.6014) data time 0.0007 (0.0038) model time 0.5736 (0.5740) loss 7.1619 (7.4918) grad_norm 2.7389 (2.3316) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:01:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][150/625] eta 0:04:44 lr 0.000631 wd 0.0500 time 0.5828 (0.5997) data time 0.0006 (0.0037) model time 0.5822 (0.5741) loss 6.8856 (7.4935) grad_norm 2.3507 (2.3253) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:01:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][160/625] eta 0:04:38 lr 0.000631 wd 0.0500 time 0.5752 (0.5982) data time 0.0007 (0.0036) model time 0.5745 (0.5740) loss 8.0368 (7.4811) grad_norm 2.7115 (2.3215) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:01:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][170/625] eta 0:04:31 lr 0.000631 wd 0.0500 time 0.5839 (0.5968) data time 0.0008 (0.0034) model time 0.5831 (0.5740) loss 7.1907 (7.4651) grad_norm 2.3043 (2.3152) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:01:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][180/625] eta 0:04:25 lr 0.000631 wd 0.0500 time 0.5806 (0.5957) data time 0.0008 (0.0033) model time 0.5798 (0.5741) loss 7.5589 (7.4664) grad_norm 2.5937 (2.3260) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:01:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][190/625] eta 0:04:18 lr 0.000631 wd 0.0500 time 0.5792 (0.5946) data time 0.0006 (0.0031) model time 0.5785 (0.5742) loss 7.4173 (7.4487) grad_norm 2.9065 (2.3291) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:01:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][200/625] eta 0:04:12 lr 0.000631 wd 0.0500 time 0.5756 (0.5936) data time 0.0006 (0.0030) model time 0.5750 (0.5742) loss 6.0188 (7.4361) grad_norm 3.1663 (2.3259) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:01:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][210/625] eta 0:04:06 lr 0.000630 wd 0.0500 time 0.5813 (0.5929) data time 0.0006 (0.0029) model time 0.5807 (0.5743) loss 7.0288 (7.4358) grad_norm 1.6785 (2.3349) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:01:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][220/625] eta 0:04:00 lr 0.000630 wd 0.0500 time 0.5747 (0.5928) data time 0.0006 (0.0028) model time 0.5740 (0.5752) loss 9.5898 (7.4314) grad_norm 2.5228 (2.3914) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:01:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][230/625] eta 0:03:53 lr 0.000630 wd 0.0500 time 0.5803 (0.5923) data time 0.0008 (0.0027) model time 0.5796 (0.5755) loss 7.7812 (7.4180) grad_norm 2.3560 (2.3837) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:02:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][240/625] eta 0:03:48 lr 0.000630 wd 0.0500 time 0.7463 (0.5948) data time 0.0009 (0.0026) model time 0.7454 (0.5796) loss 8.7651 (7.4233) grad_norm 3.0667 (2.3845) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:02:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][250/625] eta 0:03:43 lr 0.000630 wd 0.0500 time 0.7077 (0.5968) data time 0.0006 (0.0026) model time 0.7071 (0.5828) loss 5.5920 (7.4018) grad_norm 2.7898 (2.3748) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:02:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][260/625] eta 0:03:38 lr 0.000630 wd 0.0500 time 0.6014 (0.5975) data time 0.0008 (0.0025) model time 0.6006 (0.5843) loss 6.2693 (7.4095) grad_norm 3.1679 (2.3768) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:02:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][270/625] eta 0:03:32 lr 0.000630 wd 0.0500 time 0.5738 (0.5975) data time 0.0007 (0.0025) model time 0.5731 (0.5849) loss 6.5477 (7.4271) grad_norm 2.8871 (2.3995) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:02:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][280/625] eta 0:03:25 lr 0.000630 wd 0.0500 time 0.5785 (0.5967) data time 0.0006 (0.0024) model time 0.5779 (0.5844) loss 7.3972 (7.4181) grad_norm 1.7385 (2.3962) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:02:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][290/625] eta 0:03:19 lr 0.000630 wd 0.0500 time 0.5755 (0.5960) data time 0.0008 (0.0023) model time 0.5747 (0.5840) loss 6.5219 (7.3989) grad_norm 3.2818 (2.3938) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:02:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][300/625] eta 0:03:13 lr 0.000629 wd 0.0500 time 0.5750 (0.5953) data time 0.0006 (0.0023) model time 0.5744 (0.5836) loss 6.5453 (7.4034) grad_norm 2.1836 (2.3915) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:02:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][310/625] eta 0:03:07 lr 0.000629 wd 0.0500 time 0.5747 (0.5946) data time 0.0008 (0.0022) model time 0.5738 (0.5832) loss 8.5074 (7.4183) grad_norm 2.3600 (2.3869) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:02:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][320/625] eta 0:03:01 lr 0.000629 wd 0.0500 time 0.5874 (0.5940) data time 0.0007 (0.0022) model time 0.5868 (0.5829) loss 6.5041 (7.4203) grad_norm 2.5013 (2.3880) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:02:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][330/625] eta 0:02:55 lr 0.000629 wd 0.0500 time 0.5769 (0.5934) data time 0.0006 (0.0022) model time 0.5763 (0.5826) loss 8.0641 (7.4295) grad_norm 1.9204 (2.4209) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:03:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][340/625] eta 0:02:48 lr 0.000629 wd 0.0500 time 0.5738 (0.5928) data time 0.0007 (0.0021) model time 0.5732 (0.5822) loss 6.9288 (7.4265) grad_norm 3.2143 (2.4296) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:03:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][350/625] eta 0:02:42 lr 0.000629 wd 0.0500 time 0.5748 (0.5923) data time 0.0008 (0.0021) model time 0.5740 (0.5819) loss 7.2935 (7.4344) grad_norm 2.2305 (2.4195) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:03:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][360/625] eta 0:02:36 lr 0.000629 wd 0.0500 time 0.5770 (0.5918) data time 0.0006 (0.0020) model time 0.5763 (0.5816) loss 6.9974 (7.4302) grad_norm 1.9359 (2.4166) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:03:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][370/625] eta 0:02:30 lr 0.000629 wd 0.0500 time 0.5740 (0.5913) data time 0.0008 (0.0020) model time 0.5733 (0.5814) loss 7.5226 (7.4311) grad_norm 2.0527 (2.4125) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:03:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][380/625] eta 0:02:24 lr 0.000629 wd 0.0500 time 0.5756 (0.5909) data time 0.0006 (0.0020) model time 0.5749 (0.5812) loss 7.6405 (7.4353) grad_norm 3.3222 (2.4279) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:03:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][390/625] eta 0:02:18 lr 0.000628 wd 0.0500 time 0.5796 (0.5905) data time 0.0008 (0.0020) model time 0.5789 (0.5809) loss 8.3847 (7.4264) grad_norm 2.1664 (2.4403) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:03:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][400/625] eta 0:02:12 lr 0.000628 wd 0.0500 time 0.5796 (0.5901) data time 0.0008 (0.0019) model time 0.5788 (0.5808) loss 8.4236 (7.4311) grad_norm 2.4992 (2.4390) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:03:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][410/625] eta 0:02:06 lr 0.000628 wd 0.0500 time 0.5757 (0.5898) data time 0.0007 (0.0019) model time 0.5750 (0.5806) loss 6.7925 (7.4292) grad_norm 3.4010 (2.4514) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:03:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][420/625] eta 0:02:00 lr 0.000628 wd 0.0500 time 0.5773 (0.5894) data time 0.0006 (0.0019) model time 0.5767 (0.5804) loss 8.2404 (7.4302) grad_norm 2.1756 (2.4527) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:03:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][430/625] eta 0:01:54 lr 0.000628 wd 0.0500 time 0.5753 (0.5891) data time 0.0008 (0.0019) model time 0.5745 (0.5802) loss 7.5664 (7.4326) grad_norm 1.7261 (2.4519) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:04:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][440/625] eta 0:01:48 lr 0.000628 wd 0.0500 time 0.5827 (0.5889) data time 0.0006 (0.0018) model time 0.5821 (0.5802) loss 6.8662 (7.4320) grad_norm 1.7957 (2.4427) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:04:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][450/625] eta 0:01:43 lr 0.000628 wd 0.0500 time 0.5747 (0.5887) data time 0.0008 (0.0018) model time 0.5739 (0.5802) loss 8.5366 (7.4373) grad_norm 1.8691 (2.4419) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:04:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][460/625] eta 0:01:37 lr 0.000628 wd 0.0500 time 0.5770 (0.5896) data time 0.0006 (0.0018) model time 0.5764 (0.5814) loss 7.7158 (7.4432) grad_norm 2.0419 (2.4540) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:04:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][470/625] eta 0:01:31 lr 0.000628 wd 0.0500 time 0.5706 (0.5911) data time 0.0006 (0.0018) model time 0.5700 (0.5832) loss 6.4389 (7.4391) grad_norm 2.3354 (2.4661) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:04:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][480/625] eta 0:01:25 lr 0.000628 wd 0.0500 time 0.6900 (0.5920) data time 0.0008 (0.0018) model time 0.6892 (0.5844) loss 8.3376 (7.4444) grad_norm 1.7269 (2.4600) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:04:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][490/625] eta 0:01:19 lr 0.000627 wd 0.0500 time 0.6762 (0.5923) data time 0.0006 (0.0017) model time 0.6755 (0.5849) loss 6.6293 (7.4400) grad_norm 1.9400 (2.4586) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:04:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][500/625] eta 0:01:13 lr 0.000627 wd 0.0500 time 0.5746 (0.5918) data time 0.0008 (0.0017) model time 0.5739 (0.5845) loss 6.5460 (7.4408) grad_norm 2.1206 (2.4636) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:04:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][510/625] eta 0:01:08 lr 0.000627 wd 0.0500 time 0.5994 (0.5915) data time 0.0008 (0.0017) model time 0.5986 (0.5844) loss 8.3557 (7.4408) grad_norm 1.8923 (2.4609) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:04:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][520/625] eta 0:01:02 lr 0.000627 wd 0.0500 time 0.5745 (0.5912) data time 0.0006 (0.0017) model time 0.5739 (0.5841) loss 5.9470 (7.4342) grad_norm 1.8509 (2.4563) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:04:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][530/625] eta 0:00:56 lr 0.000627 wd 0.0500 time 0.5752 (0.5909) data time 0.0009 (0.0017) model time 0.5743 (0.5839) loss 8.6819 (7.4390) grad_norm 2.5138 (2.4550) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:05:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][540/625] eta 0:00:50 lr 0.000627 wd 0.0500 time 0.5764 (0.5906) data time 0.0008 (0.0017) model time 0.5756 (0.5837) loss 9.2748 (7.4413) grad_norm 2.9388 (2.4541) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:05:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][550/625] eta 0:00:44 lr 0.000627 wd 0.0500 time 0.5762 (0.5904) data time 0.0006 (0.0016) model time 0.5756 (0.5836) loss 7.6122 (7.4408) grad_norm 1.7389 (2.4462) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:05:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][560/625] eta 0:00:38 lr 0.000627 wd 0.0500 time 0.5744 (0.5901) data time 0.0008 (0.0016) model time 0.5736 (0.5835) loss 8.1561 (7.4417) grad_norm 2.0896 (2.4506) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:05:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][570/625] eta 0:00:32 lr 0.000627 wd 0.0500 time 0.5760 (0.5899) data time 0.0006 (0.0016) model time 0.5754 (0.5833) loss 7.1887 (7.4355) grad_norm 2.4829 (2.4498) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:05:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][580/625] eta 0:00:26 lr 0.000626 wd 0.0500 time 0.6456 (0.5898) data time 0.0007 (0.0016) model time 0.6448 (0.5833) loss 8.4822 (7.4290) grad_norm 1.5549 (2.4482) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:05:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][590/625] eta 0:00:20 lr 0.000626 wd 0.0500 time 0.5741 (0.5895) data time 0.0006 (0.0016) model time 0.5735 (0.5831) loss 8.2522 (7.4163) grad_norm 1.7682 (2.4490) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:05:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][600/625] eta 0:00:14 lr 0.000626 wd 0.0500 time 0.5799 (0.5893) data time 0.0006 (0.0016) model time 0.5793 (0.5830) loss 8.0931 (7.4143) grad_norm 2.6413 (2.4470) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:05:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][610/625] eta 0:00:08 lr 0.000626 wd 0.0500 time 0.5883 (0.5892) data time 0.0005 (0.0016) model time 0.5878 (0.5829) loss 7.6459 (7.4128) grad_norm 1.5127 (2.4363) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:05:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [156/300][620/625] eta 0:00:02 lr 0.000626 wd 0.0500 time 0.5751 (0.5889) data time 0.0004 (0.0016) model time 0.5748 (0.5828) loss 7.8113 (7.4158) grad_norm 1.6442 (2.4301) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:05:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 156 training takes 0:06:08 +[2024-07-25 06:05:49 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 06:05:51 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 06:05:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.480 (0.480) Loss 0.5103 (0.5103) Acc@1 89.990 (89.990) Acc@5 98.682 (98.682) Mem 22339MB +[2024-07-25 06:05:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.7954 (0.6452) Acc@1 81.396 (86.230) Acc@5 96.533 (97.718) Mem 22339MB +[2024-07-25 06:05:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9497 (0.7562) Acc@1 76.270 (83.003) Acc@5 95.264 (96.640) Mem 22339MB +[2024-07-25 06:05:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.644 Acc@5 96.599 +[2024-07-25 06:05:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.6% +[2024-07-25 06:05:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.849 (0.849) Loss 0.4929 (0.4929) Acc@1 89.844 (89.844) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 06:05:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.191) Loss 0.7686 (0.6233) Acc@1 81.787 (86.776) Acc@5 96.338 (97.785) Mem 22339MB +[2024-07-25 06:05:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.161) Loss 0.8984 (0.7252) Acc@1 78.027 (83.619) Acc@5 95.508 (96.801) Mem 22339MB +[2024-07-25 06:05:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.235 Acc@5 96.801 +[2024-07-25 06:05:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.2% +[2024-07-25 06:05:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.24% +[2024-07-25 06:05:58 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 06:06:00 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 06:06:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][0/625] eta 0:08:59 lr 0.000626 wd 0.0500 time 0.8636 (0.8636) data time 0.3453 (0.3453) model time 0.0000 (0.0000) loss 7.7377 (7.7377) grad_norm 1.6832 (1.6832) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:06:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][10/625] eta 0:06:09 lr 0.000626 wd 0.0500 time 0.5711 (0.6008) data time 0.0006 (0.0322) model time 0.0000 (0.0000) loss 8.5261 (7.6807) grad_norm 1.8306 (2.0052) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:06:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][20/625] eta 0:05:55 lr 0.000626 wd 0.0500 time 0.5714 (0.5880) data time 0.0008 (0.0173) model time 0.0000 (0.0000) loss 5.6800 (7.4104) grad_norm 1.5564 (2.0606) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:06:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][30/625] eta 0:05:50 lr 0.000626 wd 0.0500 time 0.5628 (0.5893) data time 0.0008 (0.0120) model time 0.0000 (0.0000) loss 8.8149 (7.5079) grad_norm 3.1015 (2.2252) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:06:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][40/625] eta 0:05:43 lr 0.000626 wd 0.0500 time 0.5714 (0.5869) data time 0.0006 (0.0095) model time 0.0000 (0.0000) loss 6.3628 (7.5100) grad_norm 2.2443 (2.2476) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:06:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][50/625] eta 0:05:40 lr 0.000625 wd 0.0500 time 0.7247 (0.5915) data time 0.0007 (0.0078) model time 0.0000 (0.0000) loss 7.1523 (7.3630) grad_norm 2.1492 (2.2752) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:06:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][60/625] eta 0:05:35 lr 0.000625 wd 0.0500 time 0.5697 (0.5943) data time 0.0008 (0.0067) model time 0.5688 (0.6077) loss 5.7107 (7.3236) grad_norm 2.2388 (2.2928) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:06:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][70/625] eta 0:05:35 lr 0.000625 wd 0.0500 time 0.7140 (0.6046) data time 0.0008 (0.0059) model time 0.7131 (0.6373) loss 6.9814 (7.3757) grad_norm 2.4938 (2.4025) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:06:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][80/625] eta 0:05:30 lr 0.000625 wd 0.0500 time 0.5714 (0.6066) data time 0.0006 (0.0052) model time 0.5709 (0.6315) loss 7.4164 (7.3968) grad_norm 2.3292 (2.4200) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:06:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][90/625] eta 0:05:23 lr 0.000625 wd 0.0500 time 0.5702 (0.6045) data time 0.0008 (0.0047) model time 0.5693 (0.6202) loss 6.1656 (7.4164) grad_norm 1.9878 (2.3874) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:07:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][100/625] eta 0:05:15 lr 0.000625 wd 0.0500 time 0.5750 (0.6015) data time 0.0009 (0.0044) model time 0.5742 (0.6108) loss 8.1226 (7.3920) grad_norm 2.0559 (2.3938) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:07:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][110/625] eta 0:05:08 lr 0.000625 wd 0.0500 time 0.5736 (0.5989) data time 0.0007 (0.0040) model time 0.5729 (0.6045) loss 5.8502 (7.4140) grad_norm 1.6013 (2.3801) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:07:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][120/625] eta 0:05:01 lr 0.000625 wd 0.0500 time 0.5718 (0.5969) data time 0.0008 (0.0038) model time 0.5710 (0.6000) loss 8.7764 (7.4328) grad_norm 2.3400 (2.3756) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:07:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][130/625] eta 0:04:54 lr 0.000625 wd 0.0500 time 0.5706 (0.5951) data time 0.0008 (0.0036) model time 0.5698 (0.5966) loss 6.7872 (7.4363) grad_norm 2.0076 (2.3805) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:07:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][140/625] eta 0:04:47 lr 0.000624 wd 0.0500 time 0.5687 (0.5937) data time 0.0006 (0.0034) model time 0.5681 (0.5940) loss 6.3464 (7.4240) grad_norm 2.4772 (2.3856) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:07:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][150/625] eta 0:04:41 lr 0.000624 wd 0.0500 time 0.5731 (0.5925) data time 0.0008 (0.0032) model time 0.5722 (0.5922) loss 8.1439 (7.4249) grad_norm 1.6907 (2.3791) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:07:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][160/625] eta 0:04:35 lr 0.000624 wd 0.0500 time 0.5633 (0.5914) data time 0.0008 (0.0031) model time 0.5626 (0.5906) loss 7.3425 (7.4248) grad_norm 3.8539 (2.4990) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:07:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][170/625] eta 0:04:28 lr 0.000624 wd 0.0500 time 0.5722 (0.5905) data time 0.0006 (0.0029) model time 0.5716 (0.5892) loss 7.6018 (7.4218) grad_norm 2.6275 (2.4964) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:07:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][180/625] eta 0:04:22 lr 0.000624 wd 0.0500 time 0.5715 (0.5896) data time 0.0006 (0.0028) model time 0.5709 (0.5880) loss 7.2983 (7.4125) grad_norm 2.1476 (2.4755) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:07:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][190/625] eta 0:04:16 lr 0.000624 wd 0.0500 time 0.5705 (0.5889) data time 0.0006 (0.0027) model time 0.5699 (0.5871) loss 8.3446 (7.4143) grad_norm 1.9282 (2.4500) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:07:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][200/625] eta 0:04:09 lr 0.000624 wd 0.0500 time 0.5745 (0.5882) data time 0.0007 (0.0026) model time 0.5738 (0.5863) loss 6.8934 (7.4041) grad_norm 1.7299 (2.4463) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:08:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][210/625] eta 0:04:03 lr 0.000624 wd 0.0500 time 0.5737 (0.5876) data time 0.0008 (0.0025) model time 0.5729 (0.5855) loss 7.6910 (7.4048) grad_norm 2.5727 (2.4478) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:08:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][220/625] eta 0:03:57 lr 0.000624 wd 0.0500 time 0.5746 (0.5871) data time 0.0008 (0.0024) model time 0.5737 (0.5849) loss 8.6428 (7.3910) grad_norm 2.0536 (2.4550) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:08:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][230/625] eta 0:03:51 lr 0.000624 wd 0.0500 time 0.5812 (0.5866) data time 0.0007 (0.0024) model time 0.5805 (0.5844) loss 7.4145 (7.4020) grad_norm 1.6635 (2.4853) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:08:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][240/625] eta 0:03:45 lr 0.000623 wd 0.0500 time 0.5725 (0.5861) data time 0.0007 (0.0023) model time 0.5717 (0.5838) loss 7.1578 (7.4107) grad_norm 2.5806 (2.4911) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:08:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][250/625] eta 0:03:39 lr 0.000623 wd 0.0500 time 0.5718 (0.5859) data time 0.0006 (0.0022) model time 0.5713 (0.5836) loss 7.2199 (7.4125) grad_norm 1.8919 (2.4904) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:08:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][260/625] eta 0:03:33 lr 0.000623 wd 0.0500 time 0.5747 (0.5861) data time 0.0008 (0.0022) model time 0.5740 (0.5840) loss 7.2681 (7.4105) grad_norm 3.0892 (2.4788) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:08:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][270/625] eta 0:03:28 lr 0.000623 wd 0.0500 time 0.7354 (0.5874) data time 0.0008 (0.0021) model time 0.7346 (0.5856) loss 7.3093 (7.4127) grad_norm 2.7060 (2.4786) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:08:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][280/625] eta 0:03:22 lr 0.000623 wd 0.0500 time 0.7084 (0.5882) data time 0.0008 (0.0021) model time 0.7076 (0.5867) loss 6.4867 (7.4130) grad_norm 1.7544 (2.4726) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:08:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][290/625] eta 0:03:17 lr 0.000623 wd 0.0500 time 0.5905 (0.5895) data time 0.0006 (0.0020) model time 0.5899 (0.5882) loss 6.9104 (7.4121) grad_norm 1.8498 (2.4580) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:08:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][300/625] eta 0:03:11 lr 0.000623 wd 0.0500 time 0.5752 (0.5902) data time 0.0008 (0.0020) model time 0.5744 (0.5892) loss 8.5322 (7.4162) grad_norm 1.4690 (2.4670) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:09:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][310/625] eta 0:03:05 lr 0.000623 wd 0.0500 time 0.5741 (0.5900) data time 0.0008 (0.0020) model time 0.5733 (0.5889) loss 8.7155 (7.4216) grad_norm 2.4449 (2.4746) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:09:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][320/625] eta 0:02:59 lr 0.000623 wd 0.0500 time 0.5723 (0.5895) data time 0.0006 (0.0019) model time 0.5717 (0.5883) loss 8.1956 (7.4329) grad_norm 2.0811 (2.4634) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:09:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][330/625] eta 0:02:53 lr 0.000622 wd 0.0500 time 0.5642 (0.5890) data time 0.0006 (0.0019) model time 0.5636 (0.5877) loss 7.3389 (7.4319) grad_norm 2.0888 (2.4575) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:09:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][340/625] eta 0:02:47 lr 0.000622 wd 0.0500 time 0.5736 (0.5886) data time 0.0006 (0.0019) model time 0.5730 (0.5872) loss 6.8546 (7.4239) grad_norm 2.2168 (2.4629) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:09:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][350/625] eta 0:02:41 lr 0.000622 wd 0.0500 time 0.5743 (0.5882) data time 0.0006 (0.0018) model time 0.5736 (0.5868) loss 6.3959 (7.4212) grad_norm 1.7645 (2.4494) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:09:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][360/625] eta 0:02:35 lr 0.000622 wd 0.0500 time 0.5741 (0.5878) data time 0.0006 (0.0018) model time 0.5735 (0.5864) loss 7.0406 (7.4317) grad_norm 1.6675 (2.4428) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:09:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][370/625] eta 0:02:29 lr 0.000622 wd 0.0500 time 0.5696 (0.5874) data time 0.0006 (0.0018) model time 0.5690 (0.5860) loss 7.1079 (7.4238) grad_norm 2.0163 (2.4314) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:09:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][380/625] eta 0:02:23 lr 0.000622 wd 0.0500 time 0.5773 (0.5871) data time 0.0006 (0.0017) model time 0.5767 (0.5856) loss 7.4077 (7.4287) grad_norm 2.7262 (2.4317) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:09:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][390/625] eta 0:02:17 lr 0.000622 wd 0.0500 time 0.5754 (0.5868) data time 0.0006 (0.0017) model time 0.5748 (0.5853) loss 6.5785 (7.4334) grad_norm 2.1420 (2.4358) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:09:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][400/625] eta 0:02:11 lr 0.000622 wd 0.0500 time 0.5721 (0.5865) data time 0.0006 (0.0017) model time 0.5715 (0.5850) loss 7.8860 (7.4218) grad_norm 4.1923 (2.4388) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:10:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][410/625] eta 0:02:06 lr 0.000622 wd 0.0500 time 0.5710 (0.5862) data time 0.0009 (0.0017) model time 0.5701 (0.5847) loss 6.7103 (7.4268) grad_norm 2.3364 (2.4335) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:10:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][420/625] eta 0:02:00 lr 0.000622 wd 0.0500 time 0.5711 (0.5860) data time 0.0006 (0.0017) model time 0.5705 (0.5844) loss 7.0132 (7.4230) grad_norm 2.2233 (2.4271) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:10:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][430/625] eta 0:01:54 lr 0.000621 wd 0.0500 time 0.5712 (0.5858) data time 0.0007 (0.0016) model time 0.5705 (0.5842) loss 7.9783 (7.4304) grad_norm 2.2331 (2.4259) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:10:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][440/625] eta 0:01:48 lr 0.000621 wd 0.0500 time 0.5753 (0.5855) data time 0.0006 (0.0016) model time 0.5747 (0.5840) loss 6.4468 (7.4281) grad_norm 1.7753 (2.4263) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:10:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][450/625] eta 0:01:42 lr 0.000621 wd 0.0500 time 0.5739 (0.5853) data time 0.0006 (0.0016) model time 0.5733 (0.5837) loss 7.0927 (7.4389) grad_norm 2.4292 (2.4195) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:10:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][460/625] eta 0:01:36 lr 0.000621 wd 0.0500 time 0.5730 (0.5851) data time 0.0009 (0.0016) model time 0.5721 (0.5835) loss 7.0207 (7.4413) grad_norm 1.7387 (2.4074) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:10:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][470/625] eta 0:01:30 lr 0.000621 wd 0.0500 time 0.5725 (0.5850) data time 0.0006 (0.0016) model time 0.5719 (0.5834) loss 7.7214 (7.4513) grad_norm 1.8683 (2.3985) loss_scale 2048.0000 (1041.3928) mem 22339MB +[2024-07-25 06:10:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][480/625] eta 0:01:24 lr 0.000621 wd 0.0500 time 0.5770 (0.5851) data time 0.0008 (0.0016) model time 0.5762 (0.5835) loss 7.5373 (7.4442) grad_norm 3.2782 (2.4043) loss_scale 2048.0000 (1062.3202) mem 22339MB +[2024-07-25 06:10:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][490/625] eta 0:01:19 lr 0.000621 wd 0.0500 time 0.7176 (0.5859) data time 0.0009 (0.0015) model time 0.7168 (0.5844) loss 7.2064 (7.4439) grad_norm 1.8844 (2.4044) loss_scale 2048.0000 (1082.3951) mem 22339MB +[2024-07-25 06:10:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][500/625] eta 0:01:13 lr 0.000621 wd 0.0500 time 0.7407 (0.5866) data time 0.0006 (0.0015) model time 0.7401 (0.5852) loss 7.1615 (7.4479) grad_norm 2.1774 (2.4026) loss_scale 2048.0000 (1101.6687) mem 22339MB +[2024-07-25 06:11:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][510/625] eta 0:01:07 lr 0.000621 wd 0.0500 time 0.7398 (0.5879) data time 0.0008 (0.0015) model time 0.7390 (0.5866) loss 8.6541 (7.4436) grad_norm 2.8048 (2.4020) loss_scale 2048.0000 (1120.1879) mem 22339MB +[2024-07-25 06:11:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][520/625] eta 0:01:01 lr 0.000620 wd 0.0500 time 0.7260 (0.5888) data time 0.0008 (0.0015) model time 0.7252 (0.5877) loss 9.4683 (7.4495) grad_norm 2.4444 (2.4143) loss_scale 2048.0000 (1137.9962) mem 22339MB +[2024-07-25 06:11:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][530/625] eta 0:00:55 lr 0.000620 wd 0.0500 time 0.5747 (0.5886) data time 0.0006 (0.0015) model time 0.5741 (0.5875) loss 7.5277 (7.4520) grad_norm 2.9040 (2.4272) loss_scale 2048.0000 (1155.1337) mem 22339MB +[2024-07-25 06:11:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][540/625] eta 0:00:50 lr 0.000620 wd 0.0500 time 0.5720 (0.5883) data time 0.0006 (0.0015) model time 0.5713 (0.5872) loss 5.4242 (7.4500) grad_norm 2.0969 (2.4397) loss_scale 2048.0000 (1171.6377) mem 22339MB +[2024-07-25 06:11:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][550/625] eta 0:00:44 lr 0.000620 wd 0.0500 time 0.5722 (0.5881) data time 0.0006 (0.0015) model time 0.5716 (0.5869) loss 7.7121 (7.4534) grad_norm 3.0945 (2.4365) loss_scale 2048.0000 (1187.5426) mem 22339MB +[2024-07-25 06:11:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][560/625] eta 0:00:38 lr 0.000620 wd 0.0500 time 0.5743 (0.5879) data time 0.0006 (0.0015) model time 0.5737 (0.5867) loss 8.5889 (7.4540) grad_norm 2.0358 (2.4317) loss_scale 2048.0000 (1202.8806) mem 22339MB +[2024-07-25 06:11:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][570/625] eta 0:00:32 lr 0.000620 wd 0.0500 time 0.5747 (0.5877) data time 0.0007 (0.0014) model time 0.5740 (0.5865) loss 7.3839 (7.4485) grad_norm 1.6192 (2.4244) loss_scale 2048.0000 (1217.6813) mem 22339MB +[2024-07-25 06:11:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][580/625] eta 0:00:26 lr 0.000620 wd 0.0500 time 0.5736 (0.5874) data time 0.0006 (0.0014) model time 0.5730 (0.5862) loss 6.1847 (7.4490) grad_norm 2.0566 (2.4181) loss_scale 2048.0000 (1231.9725) mem 22339MB +[2024-07-25 06:11:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][590/625] eta 0:00:20 lr 0.000620 wd 0.0500 time 0.5743 (0.5872) data time 0.0006 (0.0014) model time 0.5737 (0.5860) loss 6.4853 (7.4566) grad_norm 2.5125 (2.4155) loss_scale 2048.0000 (1245.7800) mem 22339MB +[2024-07-25 06:11:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][600/625] eta 0:00:14 lr 0.000620 wd 0.0500 time 0.5780 (0.5870) data time 0.0006 (0.0014) model time 0.5775 (0.5858) loss 6.8049 (7.4570) grad_norm 3.1066 (2.4240) loss_scale 2048.0000 (1259.1281) mem 22339MB +[2024-07-25 06:11:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][610/625] eta 0:00:08 lr 0.000619 wd 0.0500 time 0.5721 (0.5868) data time 0.0005 (0.0014) model time 0.5716 (0.5856) loss 7.7503 (7.4569) grad_norm 2.4959 (2.4284) loss_scale 2048.0000 (1272.0393) mem 22339MB +[2024-07-25 06:12:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [157/300][620/625] eta 0:00:02 lr 0.000619 wd 0.0500 time 0.5691 (0.5866) data time 0.0007 (0.0014) model time 0.5684 (0.5853) loss 8.8543 (7.4555) grad_norm 4.5018 (2.4358) loss_scale 2048.0000 (1284.5346) mem 22339MB +[2024-07-25 06:12:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 157 training takes 0:06:06 +[2024-07-25 06:12:06 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 06:12:08 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 06:12:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.483 (0.483) Loss 0.5332 (0.5332) Acc@1 89.111 (89.111) Acc@5 98.779 (98.779) Mem 22339MB +[2024-07-25 06:12:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.8276 (0.6591) Acc@1 80.518 (86.195) Acc@5 96.240 (97.701) Mem 22339MB +[2024-07-25 06:12:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.9321 (0.7717) Acc@1 77.734 (82.999) Acc@5 95.166 (96.563) Mem 22339MB +[2024-07-25 06:12:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.632 Acc@5 96.549 +[2024-07-25 06:12:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.6% +[2024-07-25 06:12:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.794 (0.794) Loss 0.4932 (0.4932) Acc@1 89.795 (89.795) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 06:12:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.187) Loss 0.7690 (0.6229) Acc@1 81.787 (86.759) Acc@5 96.338 (97.798) Mem 22339MB +[2024-07-25 06:12:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.158) Loss 0.8975 (0.7248) Acc@1 78.174 (83.617) Acc@5 95.557 (96.817) Mem 22339MB +[2024-07-25 06:12:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.239 Acc@5 96.821 +[2024-07-25 06:12:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.2% +[2024-07-25 06:12:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.24% +[2024-07-25 06:12:15 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 06:12:16 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 06:12:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][0/625] eta 0:10:14 lr 0.000619 wd 0.0500 time 0.9839 (0.9839) data time 0.4660 (0.4660) model time 0.0000 (0.0000) loss 7.6612 (7.6612) grad_norm 2.4215 (2.4215) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:12:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][10/625] eta 0:06:25 lr 0.000619 wd 0.0500 time 0.5620 (0.6274) data time 0.0006 (0.0431) model time 0.0000 (0.0000) loss 5.6177 (7.1092) grad_norm 3.0912 (2.4788) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:12:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][20/625] eta 0:06:05 lr 0.000619 wd 0.0500 time 0.5729 (0.6036) data time 0.0008 (0.0230) model time 0.0000 (0.0000) loss 8.1833 (7.2862) grad_norm 2.7444 (2.4584) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:12:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][30/625] eta 0:05:53 lr 0.000619 wd 0.0500 time 0.5702 (0.5946) data time 0.0006 (0.0158) model time 0.0000 (0.0000) loss 7.4459 (7.3336) grad_norm 2.5804 (2.3686) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:12:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][40/625] eta 0:05:45 lr 0.000619 wd 0.0500 time 0.5699 (0.5899) data time 0.0008 (0.0122) model time 0.0000 (0.0000) loss 8.1399 (7.3707) grad_norm 1.7017 (2.3111) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:12:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][50/625] eta 0:05:37 lr 0.000619 wd 0.0500 time 0.5711 (0.5877) data time 0.0008 (0.0099) model time 0.0000 (0.0000) loss 7.5105 (7.4155) grad_norm 2.7352 (2.2791) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:12:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][60/625] eta 0:05:30 lr 0.000619 wd 0.0500 time 0.5749 (0.5856) data time 0.0008 (0.0085) model time 0.5741 (0.5739) loss 6.3219 (7.2854) grad_norm 3.6241 (2.5228) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:12:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][70/625] eta 0:05:25 lr 0.000619 wd 0.0500 time 0.7372 (0.5864) data time 0.0008 (0.0074) model time 0.7364 (0.5822) loss 7.9695 (7.3369) grad_norm 4.2202 (2.6500) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:13:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][80/625] eta 0:05:18 lr 0.000618 wd 0.0500 time 0.5747 (0.5850) data time 0.0008 (0.0066) model time 0.5738 (0.5796) loss 6.8366 (7.3402) grad_norm 1.6938 (2.6136) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:13:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][90/625] eta 0:05:14 lr 0.000618 wd 0.0500 time 0.5712 (0.5883) data time 0.0006 (0.0059) model time 0.5706 (0.5882) loss 7.6857 (7.3528) grad_norm 2.6299 (2.5717) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:13:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][100/625] eta 0:05:12 lr 0.000618 wd 0.0500 time 0.5744 (0.5953) data time 0.0008 (0.0054) model time 0.5736 (0.6021) loss 7.8352 (7.3542) grad_norm 2.1627 (2.5669) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:13:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][110/625] eta 0:05:09 lr 0.000618 wd 0.0500 time 0.5683 (0.6006) data time 0.0007 (0.0050) model time 0.5676 (0.6108) loss 7.3828 (7.3461) grad_norm 2.1942 (2.5241) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:13:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][120/625] eta 0:05:02 lr 0.000618 wd 0.0500 time 0.5695 (0.5999) data time 0.0006 (0.0047) model time 0.5688 (0.6080) loss 7.8535 (7.3200) grad_norm 2.2823 (2.4886) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:13:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][130/625] eta 0:04:56 lr 0.000618 wd 0.0500 time 0.5721 (0.5993) data time 0.0006 (0.0044) model time 0.5715 (0.6059) loss 8.1777 (7.3507) grad_norm 1.7389 (2.4615) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:13:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][140/625] eta 0:04:49 lr 0.000618 wd 0.0500 time 0.5636 (0.5978) data time 0.0006 (0.0041) model time 0.5630 (0.6026) loss 6.9974 (7.3566) grad_norm 2.1814 (2.4496) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:13:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][150/625] eta 0:04:43 lr 0.000618 wd 0.0500 time 0.5697 (0.5964) data time 0.0008 (0.0039) model time 0.5689 (0.6000) loss 7.8893 (7.3680) grad_norm 2.2210 (2.4437) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:13:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][160/625] eta 0:04:36 lr 0.000618 wd 0.0500 time 0.5694 (0.5951) data time 0.0008 (0.0037) model time 0.5687 (0.5977) loss 8.3020 (7.3835) grad_norm 1.7799 (2.4482) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:13:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][170/625] eta 0:04:30 lr 0.000618 wd 0.0500 time 0.5721 (0.5940) data time 0.0006 (0.0035) model time 0.5715 (0.5959) loss 6.0036 (7.3601) grad_norm 2.4422 (2.4447) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:14:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][180/625] eta 0:04:23 lr 0.000617 wd 0.0500 time 0.5768 (0.5931) data time 0.0006 (0.0034) model time 0.5762 (0.5944) loss 6.5324 (7.3684) grad_norm 2.3663 (2.4599) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:14:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][190/625] eta 0:04:17 lr 0.000617 wd 0.0500 time 0.5706 (0.5921) data time 0.0008 (0.0033) model time 0.5699 (0.5929) loss 7.5732 (7.3499) grad_norm 3.3872 (2.4706) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:14:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][200/625] eta 0:04:11 lr 0.000617 wd 0.0500 time 0.5752 (0.5912) data time 0.0008 (0.0031) model time 0.5745 (0.5916) loss 6.6755 (7.3449) grad_norm 2.1637 (2.4714) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:14:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][210/625] eta 0:04:05 lr 0.000617 wd 0.0500 time 0.5751 (0.5904) data time 0.0008 (0.0030) model time 0.5744 (0.5905) loss 8.3577 (7.3477) grad_norm 3.5492 (2.4645) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:14:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][220/625] eta 0:03:58 lr 0.000617 wd 0.0500 time 0.5838 (0.5899) data time 0.0009 (0.0029) model time 0.5829 (0.5897) loss 7.3797 (7.3646) grad_norm 1.9432 (2.4596) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:14:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][230/625] eta 0:03:52 lr 0.000617 wd 0.0500 time 0.5710 (0.5895) data time 0.0008 (0.0028) model time 0.5703 (0.5892) loss 7.6287 (7.3616) grad_norm 3.3398 (2.4522) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:14:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][240/625] eta 0:03:46 lr 0.000617 wd 0.0500 time 0.5690 (0.5892) data time 0.0007 (0.0027) model time 0.5682 (0.5888) loss 6.3730 (7.3707) grad_norm 3.3641 (2.4569) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:14:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][250/625] eta 0:03:40 lr 0.000617 wd 0.0500 time 0.5718 (0.5887) data time 0.0006 (0.0027) model time 0.5713 (0.5881) loss 6.5205 (7.3803) grad_norm 1.6778 (2.4570) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:14:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][260/625] eta 0:03:34 lr 0.000617 wd 0.0500 time 0.5685 (0.5882) data time 0.0007 (0.0026) model time 0.5679 (0.5875) loss 8.3683 (7.3821) grad_norm 1.8088 (2.4479) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:14:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][270/625] eta 0:03:28 lr 0.000616 wd 0.0500 time 0.5711 (0.5877) data time 0.0008 (0.0025) model time 0.5703 (0.5869) loss 6.0832 (7.3853) grad_norm 1.9521 (2.4383) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:15:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][280/625] eta 0:03:22 lr 0.000616 wd 0.0500 time 0.5741 (0.5872) data time 0.0006 (0.0025) model time 0.5736 (0.5863) loss 6.4107 (7.3837) grad_norm 5.7947 (2.4422) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:15:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][290/625] eta 0:03:16 lr 0.000616 wd 0.0500 time 0.5726 (0.5868) data time 0.0006 (0.0024) model time 0.5720 (0.5858) loss 6.7206 (7.3927) grad_norm 1.7539 (2.4377) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:15:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][300/625] eta 0:03:10 lr 0.000616 wd 0.0500 time 0.5698 (0.5868) data time 0.0006 (0.0024) model time 0.5692 (0.5858) loss 9.1511 (7.3942) grad_norm 2.7815 (2.4280) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:15:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][310/625] eta 0:03:04 lr 0.000616 wd 0.0500 time 0.5732 (0.5872) data time 0.0006 (0.0023) model time 0.5726 (0.5863) loss 7.2363 (7.3980) grad_norm 2.0347 (2.4167) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:15:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][320/625] eta 0:02:59 lr 0.000616 wd 0.0500 time 0.7220 (0.5883) data time 0.0008 (0.0023) model time 0.7213 (0.5876) loss 7.4371 (7.4052) grad_norm 1.7986 (2.4147) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:15:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][330/625] eta 0:02:53 lr 0.000616 wd 0.0500 time 0.5733 (0.5890) data time 0.0008 (0.0022) model time 0.5725 (0.5884) loss 8.0520 (7.4146) grad_norm 1.9055 (2.4055) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:15:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][340/625] eta 0:02:48 lr 0.000616 wd 0.0500 time 0.6077 (0.5896) data time 0.0009 (0.0022) model time 0.6069 (0.5891) loss 6.3257 (7.4137) grad_norm 2.8693 (2.4219) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:15:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][350/625] eta 0:02:42 lr 0.000616 wd 0.0500 time 0.5689 (0.5892) data time 0.0008 (0.0021) model time 0.5681 (0.5886) loss 8.0539 (7.4187) grad_norm 1.9501 (2.4193) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:15:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][360/625] eta 0:02:36 lr 0.000615 wd 0.0500 time 0.5712 (0.5887) data time 0.0006 (0.0021) model time 0.5706 (0.5881) loss 7.8728 (7.4160) grad_norm 2.8609 (2.4120) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:15:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][370/625] eta 0:02:30 lr 0.000615 wd 0.0500 time 0.5743 (0.5883) data time 0.0006 (0.0021) model time 0.5737 (0.5876) loss 8.4967 (7.4198) grad_norm 1.6615 (2.4119) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:16:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][380/625] eta 0:02:24 lr 0.000615 wd 0.0500 time 0.5715 (0.5880) data time 0.0008 (0.0020) model time 0.5707 (0.5872) loss 6.5040 (7.4147) grad_norm 4.7546 (2.4220) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:16:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][390/625] eta 0:02:18 lr 0.000615 wd 0.0500 time 0.5725 (0.5876) data time 0.0008 (0.0020) model time 0.5717 (0.5868) loss 6.0550 (7.4172) grad_norm 1.8624 (2.4290) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:16:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][400/625] eta 0:02:12 lr 0.000615 wd 0.0500 time 0.5730 (0.5873) data time 0.0006 (0.0020) model time 0.5724 (0.5865) loss 7.0449 (7.4164) grad_norm 1.8458 (2.4196) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:16:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][410/625] eta 0:02:06 lr 0.000615 wd 0.0500 time 0.5692 (0.5870) data time 0.0006 (0.0019) model time 0.5686 (0.5861) loss 6.2809 (7.4124) grad_norm 1.5994 (2.4126) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:16:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][420/625] eta 0:02:00 lr 0.000615 wd 0.0500 time 0.5748 (0.5868) data time 0.0008 (0.0019) model time 0.5740 (0.5858) loss 6.7955 (7.4060) grad_norm 1.5285 (2.4068) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:16:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][430/625] eta 0:01:54 lr 0.000615 wd 0.0500 time 0.5737 (0.5865) data time 0.0008 (0.0019) model time 0.5729 (0.5855) loss 7.7004 (7.4101) grad_norm 2.2554 (2.4023) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:16:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][440/625] eta 0:01:48 lr 0.000615 wd 0.0500 time 0.5723 (0.5862) data time 0.0008 (0.0019) model time 0.5716 (0.5852) loss 5.9751 (7.4018) grad_norm 2.9665 (2.4019) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:16:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][450/625] eta 0:01:42 lr 0.000615 wd 0.0500 time 0.5738 (0.5862) data time 0.0006 (0.0019) model time 0.5733 (0.5851) loss 8.8593 (7.3952) grad_norm 2.2264 (2.3983) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:16:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][460/625] eta 0:01:36 lr 0.000614 wd 0.0500 time 0.5705 (0.5859) data time 0.0008 (0.0018) model time 0.5697 (0.5849) loss 6.6675 (7.3902) grad_norm 4.8507 (2.4048) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:16:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][470/625] eta 0:01:30 lr 0.000614 wd 0.0500 time 0.5718 (0.5857) data time 0.0006 (0.0018) model time 0.5712 (0.5846) loss 7.5018 (7.3855) grad_norm 2.1979 (2.4058) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:16:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][480/625] eta 0:01:24 lr 0.000614 wd 0.0500 time 0.5737 (0.5855) data time 0.0007 (0.0018) model time 0.5730 (0.5844) loss 7.0308 (7.3866) grad_norm 3.0156 (2.4002) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:17:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][490/625] eta 0:01:19 lr 0.000614 wd 0.0500 time 0.5610 (0.5853) data time 0.0006 (0.0018) model time 0.5605 (0.5842) loss 8.6191 (7.3875) grad_norm 2.4608 (2.3989) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:17:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][500/625] eta 0:01:13 lr 0.000614 wd 0.0500 time 0.5745 (0.5851) data time 0.0006 (0.0017) model time 0.5738 (0.5839) loss 8.9284 (7.3829) grad_norm 1.6373 (2.3976) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:17:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][510/625] eta 0:01:07 lr 0.000614 wd 0.0500 time 0.5654 (0.5849) data time 0.0007 (0.0017) model time 0.5647 (0.5837) loss 8.1662 (7.3875) grad_norm 2.3149 (2.3963) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:17:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][520/625] eta 0:01:01 lr 0.000614 wd 0.0500 time 0.5722 (0.5849) data time 0.0008 (0.0017) model time 0.5714 (0.5838) loss 7.1214 (7.3840) grad_norm 4.6610 (2.3978) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:17:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][530/625] eta 0:00:55 lr 0.000614 wd 0.0500 time 0.5685 (0.5856) data time 0.0008 (0.0017) model time 0.5678 (0.5845) loss 6.9428 (7.3874) grad_norm 2.1570 (2.3935) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:17:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][540/625] eta 0:00:49 lr 0.000614 wd 0.0500 time 0.7075 (0.5867) data time 0.0008 (0.0017) model time 0.7067 (0.5857) loss 8.8326 (7.3895) grad_norm 2.2865 (2.3960) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:17:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][550/625] eta 0:00:44 lr 0.000613 wd 0.0500 time 0.6984 (0.5877) data time 0.0008 (0.0017) model time 0.6976 (0.5869) loss 8.1649 (7.3945) grad_norm 2.3459 (2.3903) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:17:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][560/625] eta 0:00:38 lr 0.000613 wd 0.0500 time 0.6839 (0.5880) data time 0.0009 (0.0016) model time 0.6831 (0.5872) loss 6.5444 (7.3950) grad_norm 2.9646 (2.3900) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:17:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][570/625] eta 0:00:32 lr 0.000613 wd 0.0500 time 0.5727 (0.5877) data time 0.0008 (0.0016) model time 0.5720 (0.5869) loss 8.5818 (7.3993) grad_norm 1.7332 (2.4030) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:17:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][580/625] eta 0:00:26 lr 0.000613 wd 0.0500 time 0.5742 (0.5875) data time 0.0008 (0.0016) model time 0.5734 (0.5867) loss 8.3634 (7.4069) grad_norm 2.4235 (2.4056) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:18:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][590/625] eta 0:00:20 lr 0.000613 wd 0.0500 time 0.5725 (0.5873) data time 0.0006 (0.0016) model time 0.5718 (0.5864) loss 8.1157 (7.4138) grad_norm 1.8939 (2.4017) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:18:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][600/625] eta 0:00:14 lr 0.000613 wd 0.0500 time 0.5705 (0.5871) data time 0.0008 (0.0016) model time 0.5697 (0.5862) loss 6.6537 (7.4176) grad_norm 2.1016 (2.4021) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:18:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][610/625] eta 0:00:08 lr 0.000613 wd 0.0500 time 0.5726 (0.5869) data time 0.0004 (0.0016) model time 0.5722 (0.5860) loss 7.5351 (7.4225) grad_norm 3.0002 (2.3979) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:18:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [158/300][620/625] eta 0:00:02 lr 0.000613 wd 0.0500 time 0.5738 (0.5867) data time 0.0006 (0.0016) model time 0.5732 (0.5858) loss 7.1195 (7.4239) grad_norm 3.0937 (2.3953) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:18:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 158 training takes 0:06:06 +[2024-07-25 06:18:23 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 06:18:25 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 06:18:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.468 (0.468) Loss 0.5181 (0.5181) Acc@1 89.160 (89.160) Acc@5 98.584 (98.584) Mem 22339MB +[2024-07-25 06:18:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.8164 (0.6472) Acc@1 80.859 (86.195) Acc@5 96.436 (97.732) Mem 22339MB +[2024-07-25 06:18:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9194 (0.7547) Acc@1 77.734 (83.068) Acc@5 95.410 (96.661) Mem 22339MB +[2024-07-25 06:18:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.740 Acc@5 96.631 +[2024-07-25 06:18:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.7% +[2024-07-25 06:18:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.788 (0.788) Loss 0.4929 (0.4929) Acc@1 89.795 (89.795) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-25 06:18:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.186) Loss 0.7676 (0.6228) Acc@1 81.738 (86.741) Acc@5 96.484 (97.829) Mem 22339MB +[2024-07-25 06:18:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.157) Loss 0.8975 (0.7245) Acc@1 78.125 (83.612) Acc@5 95.557 (96.838) Mem 22339MB +[2024-07-25 06:18:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.237 Acc@5 96.837 +[2024-07-25 06:18:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.2% +[2024-07-25 06:18:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][0/625] eta 0:14:58 lr 0.000613 wd 0.0500 time 1.4378 (1.4378) data time 0.7093 (0.7093) model time 0.0000 (0.0000) loss 6.1049 (6.1049) grad_norm 1.8230 (1.8230) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:18:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][10/625] eta 0:06:44 lr 0.000613 wd 0.0500 time 0.5769 (0.6575) data time 0.0008 (0.0652) model time 0.0000 (0.0000) loss 7.5552 (7.1257) grad_norm 2.3595 (2.5153) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:18:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][20/625] eta 0:06:14 lr 0.000612 wd 0.0500 time 0.5789 (0.6182) data time 0.0006 (0.0346) model time 0.0000 (0.0000) loss 7.7906 (7.3616) grad_norm 2.3273 (2.4885) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:18:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][30/625] eta 0:05:59 lr 0.000612 wd 0.0500 time 0.5755 (0.6042) data time 0.0010 (0.0239) model time 0.0000 (0.0000) loss 7.7036 (7.4694) grad_norm 2.3509 (2.5704) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:18:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][40/625] eta 0:05:49 lr 0.000612 wd 0.0500 time 0.5751 (0.5978) data time 0.0008 (0.0183) model time 0.0000 (0.0000) loss 5.5038 (7.4088) grad_norm 2.6877 (2.6227) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:19:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][50/625] eta 0:05:41 lr 0.000612 wd 0.0500 time 0.5772 (0.5934) data time 0.0006 (0.0148) model time 0.0000 (0.0000) loss 8.2670 (7.4333) grad_norm 2.4376 (2.5413) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:19:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][60/625] eta 0:05:33 lr 0.000612 wd 0.0500 time 0.5812 (0.5905) data time 0.0005 (0.0126) model time 0.5806 (0.5745) loss 5.9880 (7.4788) grad_norm 1.8457 (2.4647) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:19:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][70/625] eta 0:05:26 lr 0.000612 wd 0.0500 time 0.5737 (0.5882) data time 0.0006 (0.0109) model time 0.5731 (0.5741) loss 6.7965 (7.4626) grad_norm 1.5581 (2.3920) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:19:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][80/625] eta 0:05:19 lr 0.000612 wd 0.0500 time 0.5754 (0.5865) data time 0.0008 (0.0097) model time 0.5746 (0.5739) loss 9.0814 (7.4749) grad_norm 2.6555 (2.3525) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:19:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][90/625] eta 0:05:13 lr 0.000612 wd 0.0500 time 0.5764 (0.5852) data time 0.0008 (0.0087) model time 0.5756 (0.5740) loss 8.0683 (7.5281) grad_norm 2.4797 (2.3634) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:19:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][100/625] eta 0:05:06 lr 0.000612 wd 0.0500 time 0.5752 (0.5841) data time 0.0009 (0.0079) model time 0.5743 (0.5738) loss 7.0166 (7.5309) grad_norm 2.1938 (2.3594) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:19:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][110/625] eta 0:05:00 lr 0.000611 wd 0.0500 time 0.5746 (0.5840) data time 0.0008 (0.0073) model time 0.5737 (0.5751) loss 7.4187 (7.5314) grad_norm 1.5350 (2.3496) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:19:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][120/625] eta 0:04:54 lr 0.000611 wd 0.0500 time 0.6693 (0.5840) data time 0.0008 (0.0067) model time 0.6684 (0.5763) loss 8.3395 (7.5545) grad_norm 2.2422 (2.3535) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:19:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][130/625] eta 0:04:50 lr 0.000611 wd 0.0500 time 0.5756 (0.5859) data time 0.0006 (0.0063) model time 0.5750 (0.5802) loss 6.9601 (7.5525) grad_norm 2.5703 (2.3552) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:19:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][140/625] eta 0:04:47 lr 0.000611 wd 0.0500 time 0.5729 (0.5920) data time 0.0007 (0.0059) model time 0.5722 (0.5903) loss 7.2241 (7.5278) grad_norm 2.3183 (2.3663) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:20:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][150/625] eta 0:04:42 lr 0.000611 wd 0.0500 time 0.5742 (0.5949) data time 0.0008 (0.0056) model time 0.5734 (0.5949) loss 6.3445 (7.5158) grad_norm 1.5394 (2.3537) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:20:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][160/625] eta 0:04:36 lr 0.000611 wd 0.0500 time 0.5799 (0.5951) data time 0.0008 (0.0053) model time 0.5792 (0.5951) loss 7.5956 (7.5235) grad_norm 3.4754 (2.4049) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:20:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][170/625] eta 0:04:30 lr 0.000611 wd 0.0500 time 0.5772 (0.5940) data time 0.0009 (0.0050) model time 0.5763 (0.5934) loss 6.8800 (7.5362) grad_norm 2.2551 (2.4044) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:20:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][180/625] eta 0:04:23 lr 0.000611 wd 0.0500 time 0.5763 (0.5930) data time 0.0008 (0.0048) model time 0.5755 (0.5920) loss 7.0496 (7.5056) grad_norm 2.1969 (2.3912) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:20:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][190/625] eta 0:04:17 lr 0.000611 wd 0.0500 time 0.5806 (0.5925) data time 0.0006 (0.0046) model time 0.5800 (0.5913) loss 8.5412 (7.5297) grad_norm 1.9328 (2.3963) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:20:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][200/625] eta 0:04:11 lr 0.000611 wd 0.0500 time 0.5898 (0.5917) data time 0.0009 (0.0044) model time 0.5889 (0.5903) loss 8.3912 (7.5132) grad_norm 1.8495 (2.3734) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:20:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][210/625] eta 0:04:05 lr 0.000610 wd 0.0500 time 0.5763 (0.5909) data time 0.0008 (0.0042) model time 0.5755 (0.5892) loss 8.5224 (7.5150) grad_norm 1.8640 (2.3662) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:20:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][220/625] eta 0:03:59 lr 0.000610 wd 0.0500 time 0.5762 (0.5902) data time 0.0007 (0.0041) model time 0.5754 (0.5884) loss 8.6548 (7.5248) grad_norm 1.5527 (2.3516) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:20:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][230/625] eta 0:03:52 lr 0.000610 wd 0.0500 time 0.5746 (0.5898) data time 0.0009 (0.0039) model time 0.5737 (0.5880) loss 7.8191 (7.5030) grad_norm 2.2726 (2.3391) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:20:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][240/625] eta 0:03:46 lr 0.000610 wd 0.0500 time 0.5728 (0.5893) data time 0.0006 (0.0038) model time 0.5722 (0.5874) loss 7.6637 (7.4865) grad_norm 6.5437 (2.3671) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:20:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][250/625] eta 0:03:40 lr 0.000610 wd 0.0500 time 0.5756 (0.5888) data time 0.0006 (0.0037) model time 0.5751 (0.5868) loss 6.9352 (7.4771) grad_norm 2.2419 (2.3940) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:21:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][260/625] eta 0:03:34 lr 0.000610 wd 0.0500 time 0.5846 (0.5883) data time 0.0007 (0.0036) model time 0.5839 (0.5862) loss 7.4197 (7.4867) grad_norm 2.3423 (2.4080) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:21:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][270/625] eta 0:03:28 lr 0.000610 wd 0.0500 time 0.5742 (0.5877) data time 0.0006 (0.0034) model time 0.5736 (0.5856) loss 8.1332 (7.4857) grad_norm 1.7184 (2.3950) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:21:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][280/625] eta 0:03:22 lr 0.000610 wd 0.0500 time 0.5833 (0.5873) data time 0.0008 (0.0034) model time 0.5825 (0.5852) loss 8.8319 (7.4920) grad_norm 1.7007 (2.3752) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:21:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][290/625] eta 0:03:16 lr 0.000610 wd 0.0500 time 0.5730 (0.5871) data time 0.0006 (0.0033) model time 0.5724 (0.5849) loss 6.1344 (7.4882) grad_norm 2.3541 (2.3694) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:21:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][300/625] eta 0:03:10 lr 0.000609 wd 0.0500 time 0.5883 (0.5867) data time 0.0006 (0.0032) model time 0.5877 (0.5845) loss 7.7360 (7.4995) grad_norm 2.1333 (2.3670) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:21:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][310/625] eta 0:03:04 lr 0.000609 wd 0.0500 time 0.5757 (0.5863) data time 0.0006 (0.0031) model time 0.5751 (0.5841) loss 7.6977 (7.5020) grad_norm 3.5078 (2.3891) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:21:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][320/625] eta 0:02:58 lr 0.000609 wd 0.0500 time 0.5719 (0.5860) data time 0.0009 (0.0030) model time 0.5710 (0.5838) loss 6.2785 (7.4920) grad_norm 2.7032 (2.3864) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:21:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][330/625] eta 0:02:52 lr 0.000609 wd 0.0500 time 0.5745 (0.5858) data time 0.0006 (0.0030) model time 0.5739 (0.5837) loss 7.5542 (7.4891) grad_norm 1.9029 (2.3858) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:21:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][340/625] eta 0:02:47 lr 0.000609 wd 0.0500 time 0.6946 (0.5864) data time 0.0006 (0.0029) model time 0.6940 (0.5843) loss 7.0701 (7.4771) grad_norm 2.0104 (2.3749) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:21:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][350/625] eta 0:02:41 lr 0.000609 wd 0.0500 time 0.5755 (0.5869) data time 0.0006 (0.0028) model time 0.5749 (0.5849) loss 8.8239 (7.4736) grad_norm 2.1772 (2.3729) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:22:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][360/625] eta 0:02:36 lr 0.000609 wd 0.0500 time 0.6248 (0.5887) data time 0.0006 (0.0028) model time 0.6242 (0.5871) loss 7.7361 (7.4734) grad_norm 3.1210 (2.3861) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:22:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][370/625] eta 0:02:30 lr 0.000609 wd 0.0500 time 0.5713 (0.5899) data time 0.0007 (0.0027) model time 0.5706 (0.5885) loss 7.7481 (7.4604) grad_norm 2.0781 (2.3772) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:22:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][380/625] eta 0:02:24 lr 0.000609 wd 0.0500 time 0.5761 (0.5898) data time 0.0006 (0.0027) model time 0.5755 (0.5885) loss 8.2573 (7.4561) grad_norm 3.7806 (2.3935) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:22:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][390/625] eta 0:02:18 lr 0.000609 wd 0.0500 time 0.5768 (0.5895) data time 0.0008 (0.0026) model time 0.5760 (0.5881) loss 6.2514 (7.4465) grad_norm 2.4345 (2.3906) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:22:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][400/625] eta 0:02:12 lr 0.000608 wd 0.0500 time 0.5871 (0.5892) data time 0.0006 (0.0026) model time 0.5865 (0.5878) loss 6.7553 (7.4484) grad_norm 2.0987 (2.3879) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:22:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][410/625] eta 0:02:06 lr 0.000608 wd 0.0500 time 0.5750 (0.5888) data time 0.0007 (0.0025) model time 0.5743 (0.5874) loss 6.2561 (7.4418) grad_norm 2.4257 (2.3810) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:22:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][420/625] eta 0:02:00 lr 0.000608 wd 0.0500 time 0.5744 (0.5885) data time 0.0008 (0.0025) model time 0.5737 (0.5870) loss 8.2293 (7.4385) grad_norm 1.8573 (2.3804) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:22:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][430/625] eta 0:01:54 lr 0.000608 wd 0.0500 time 0.5748 (0.5882) data time 0.0006 (0.0025) model time 0.5742 (0.5867) loss 9.1941 (7.4426) grad_norm 1.9701 (2.3729) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:22:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][440/625] eta 0:01:48 lr 0.000608 wd 0.0500 time 0.5792 (0.5880) data time 0.0006 (0.0024) model time 0.5785 (0.5865) loss 6.4266 (7.4402) grad_norm 2.8196 (2.3693) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:22:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][450/625] eta 0:01:42 lr 0.000608 wd 0.0500 time 0.5732 (0.5880) data time 0.0006 (0.0024) model time 0.5726 (0.5865) loss 6.6105 (7.4391) grad_norm 2.8869 (2.3818) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:23:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][460/625] eta 0:01:36 lr 0.000608 wd 0.0500 time 0.5754 (0.5877) data time 0.0008 (0.0024) model time 0.5747 (0.5862) loss 7.6553 (7.4369) grad_norm 2.6708 (2.3805) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:23:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][470/625] eta 0:01:31 lr 0.000608 wd 0.0500 time 0.5856 (0.5875) data time 0.0006 (0.0023) model time 0.5850 (0.5860) loss 7.9756 (7.4438) grad_norm 1.7061 (2.3741) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:23:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][480/625] eta 0:01:25 lr 0.000608 wd 0.0500 time 0.5775 (0.5872) data time 0.0008 (0.0023) model time 0.5767 (0.5857) loss 6.7959 (7.4496) grad_norm 1.6276 (2.3642) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:23:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][490/625] eta 0:01:19 lr 0.000607 wd 0.0500 time 0.5750 (0.5869) data time 0.0008 (0.0023) model time 0.5743 (0.5854) loss 7.9684 (7.4480) grad_norm 1.9796 (2.3621) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:23:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][500/625] eta 0:01:13 lr 0.000607 wd 0.0500 time 0.5748 (0.5867) data time 0.0006 (0.0022) model time 0.5742 (0.5852) loss 5.3603 (7.4321) grad_norm 1.8854 (2.3551) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:23:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][510/625] eta 0:01:07 lr 0.000607 wd 0.0500 time 0.5750 (0.5865) data time 0.0008 (0.0022) model time 0.5743 (0.5849) loss 8.4017 (7.4373) grad_norm 2.4341 (2.3548) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:23:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][520/625] eta 0:01:01 lr 0.000607 wd 0.0500 time 0.5761 (0.5863) data time 0.0006 (0.0022) model time 0.5755 (0.5847) loss 7.7986 (7.4368) grad_norm 1.8983 (2.3523) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:23:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][530/625] eta 0:00:55 lr 0.000607 wd 0.0500 time 0.5745 (0.5860) data time 0.0008 (0.0022) model time 0.5738 (0.5845) loss 8.7025 (7.4404) grad_norm 2.3393 (2.3474) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:23:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][540/625] eta 0:00:49 lr 0.000607 wd 0.0500 time 0.5769 (0.5858) data time 0.0009 (0.0021) model time 0.5760 (0.5842) loss 9.2118 (7.4390) grad_norm 2.0057 (2.3447) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:23:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][550/625] eta 0:00:43 lr 0.000607 wd 0.0500 time 0.7430 (0.5859) data time 0.0007 (0.0021) model time 0.7423 (0.5844) loss 6.4614 (7.4374) grad_norm 3.4759 (2.3446) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:24:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][560/625] eta 0:00:38 lr 0.000607 wd 0.0500 time 0.7063 (0.5863) data time 0.0006 (0.0021) model time 0.7057 (0.5848) loss 7.5024 (7.4431) grad_norm 3.2133 (2.3478) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:24:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][570/625] eta 0:00:32 lr 0.000607 wd 0.0500 time 0.7054 (0.5867) data time 0.0008 (0.0021) model time 0.7047 (0.5852) loss 8.5214 (7.4488) grad_norm 2.2648 (2.3446) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:24:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][580/625] eta 0:00:26 lr 0.000606 wd 0.0500 time 0.7241 (0.5880) data time 0.0008 (0.0020) model time 0.7233 (0.5866) loss 6.5987 (7.4439) grad_norm 3.7360 (2.3570) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:24:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][590/625] eta 0:00:20 lr 0.000606 wd 0.0500 time 0.5719 (0.5889) data time 0.0008 (0.0020) model time 0.5711 (0.5877) loss 7.8283 (7.4462) grad_norm 2.7954 (2.3587) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:24:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][600/625] eta 0:00:14 lr 0.000606 wd 0.0500 time 0.5795 (0.5890) data time 0.0006 (0.0020) model time 0.5789 (0.5877) loss 6.4904 (7.4483) grad_norm 2.0045 (2.3516) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:24:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][610/625] eta 0:00:08 lr 0.000606 wd 0.0500 time 0.5779 (0.5887) data time 0.0006 (0.0020) model time 0.5773 (0.5875) loss 7.9810 (7.4368) grad_norm 1.9135 (2.3518) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:24:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [159/300][620/625] eta 0:00:02 lr 0.000606 wd 0.0500 time 0.5762 (0.5885) data time 0.0004 (0.0020) model time 0.5758 (0.5873) loss 6.9148 (7.4389) grad_norm 1.6779 (2.3518) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:24:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 159 training takes 0:06:07 +[2024-07-25 06:24:39 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 06:24:41 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 06:24:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.476 (0.476) Loss 0.4944 (0.4944) Acc@1 90.527 (90.527) Acc@5 98.730 (98.730) Mem 22339MB +[2024-07-25 06:24:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.8369 (0.6369) Acc@1 79.834 (86.368) Acc@5 95.898 (97.732) Mem 22339MB +[2024-07-25 06:24:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9399 (0.7520) Acc@1 76.611 (83.203) Acc@5 95.068 (96.638) Mem 22339MB +[2024-07-25 06:24:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.881 Acc@5 96.639 +[2024-07-25 06:24:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.9% +[2024-07-25 06:24:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 82.88% +[2024-07-25 06:24:44 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 06:24:46 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 06:24:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.473 (0.473) Loss 0.4937 (0.4937) Acc@1 89.795 (89.795) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-25 06:24:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7676 (0.6228) Acc@1 81.885 (86.772) Acc@5 96.436 (97.820) Mem 22339MB +[2024-07-25 06:24:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8960 (0.7243) Acc@1 78.125 (83.612) Acc@5 95.557 (96.831) Mem 22339MB +[2024-07-25 06:24:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.239 Acc@5 96.835 +[2024-07-25 06:24:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.2% +[2024-07-25 06:24:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][0/625] eta 0:14:47 lr 0.000606 wd 0.0500 time 1.4196 (1.4196) data time 0.5801 (0.5801) model time 0.0000 (0.0000) loss 6.1220 (6.1220) grad_norm 1.8393 (1.8393) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:24:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][10/625] eta 0:06:40 lr 0.000606 wd 0.0500 time 0.5739 (0.6510) data time 0.0008 (0.0535) model time 0.0000 (0.0000) loss 7.4633 (7.0761) grad_norm 2.1327 (2.5799) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:25:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][20/625] eta 0:06:11 lr 0.000606 wd 0.0500 time 0.5760 (0.6141) data time 0.0008 (0.0284) model time 0.0000 (0.0000) loss 8.5151 (7.1059) grad_norm 3.5502 (2.6435) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:25:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][30/625] eta 0:05:57 lr 0.000606 wd 0.0500 time 0.5767 (0.6010) data time 0.0008 (0.0195) model time 0.0000 (0.0000) loss 8.1972 (7.2351) grad_norm 2.4699 (2.8439) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:25:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][40/625] eta 0:05:47 lr 0.000606 wd 0.0500 time 0.5708 (0.5943) data time 0.0008 (0.0149) model time 0.0000 (0.0000) loss 8.9701 (7.3031) grad_norm 2.4787 (2.8618) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:25:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][50/625] eta 0:05:39 lr 0.000605 wd 0.0500 time 0.5749 (0.5903) data time 0.0006 (0.0122) model time 0.0000 (0.0000) loss 6.9765 (7.2707) grad_norm 1.8774 (2.8138) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:25:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][60/625] eta 0:05:32 lr 0.000605 wd 0.0500 time 0.5799 (0.5881) data time 0.0006 (0.0103) model time 0.5793 (0.5760) loss 7.1238 (7.2732) grad_norm 1.8880 (2.8108) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:25:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][70/625] eta 0:05:25 lr 0.000605 wd 0.0500 time 0.5751 (0.5860) data time 0.0008 (0.0090) model time 0.5743 (0.5745) loss 8.7248 (7.3664) grad_norm 1.6471 (2.7390) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:25:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][80/625] eta 0:05:18 lr 0.000605 wd 0.0500 time 0.5759 (0.5847) data time 0.0006 (0.0080) model time 0.5753 (0.5743) loss 7.7678 (7.4250) grad_norm 1.5835 (2.6578) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:25:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][90/625] eta 0:05:12 lr 0.000605 wd 0.0500 time 0.5789 (0.5836) data time 0.0006 (0.0072) model time 0.5783 (0.5742) loss 8.8075 (7.4205) grad_norm 3.0684 (2.6114) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:25:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][100/625] eta 0:05:05 lr 0.000605 wd 0.0500 time 0.5734 (0.5828) data time 0.0006 (0.0065) model time 0.5727 (0.5743) loss 7.2194 (7.4195) grad_norm 2.3623 (2.5760) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:25:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][110/625] eta 0:04:59 lr 0.000605 wd 0.0500 time 0.5767 (0.5820) data time 0.0007 (0.0060) model time 0.5761 (0.5742) loss 7.6624 (7.4311) grad_norm 2.5883 (2.5848) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:26:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][120/625] eta 0:04:53 lr 0.000605 wd 0.0500 time 0.5752 (0.5814) data time 0.0006 (0.0056) model time 0.5745 (0.5741) loss 5.9903 (7.3769) grad_norm 2.3435 (2.5621) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:26:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][130/625] eta 0:04:47 lr 0.000605 wd 0.0500 time 0.5744 (0.5809) data time 0.0008 (0.0052) model time 0.5735 (0.5741) loss 6.1064 (7.3553) grad_norm 3.4309 (2.5519) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:26:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][140/625] eta 0:04:41 lr 0.000605 wd 0.0500 time 0.5858 (0.5807) data time 0.0006 (0.0049) model time 0.5852 (0.5744) loss 6.0727 (7.3512) grad_norm 2.1901 (2.5254) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:26:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][150/625] eta 0:04:35 lr 0.000604 wd 0.0500 time 0.5762 (0.5810) data time 0.0009 (0.0046) model time 0.5752 (0.5755) loss 6.7659 (7.3659) grad_norm 2.7379 (2.5072) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:26:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][160/625] eta 0:04:31 lr 0.000604 wd 0.0500 time 0.7368 (0.5831) data time 0.0008 (0.0044) model time 0.7360 (0.5790) loss 8.1270 (7.3860) grad_norm 1.6599 (2.4853) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:26:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][170/625] eta 0:04:26 lr 0.000604 wd 0.0500 time 0.5727 (0.5860) data time 0.0008 (0.0042) model time 0.5719 (0.5834) loss 7.5944 (7.3881) grad_norm 1.8346 (2.4981) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:26:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][180/625] eta 0:04:23 lr 0.000604 wd 0.0500 time 0.7393 (0.5922) data time 0.0006 (0.0040) model time 0.7387 (0.5921) loss 7.1330 (7.4121) grad_norm 2.2084 (2.4940) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:26:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][190/625] eta 0:04:17 lr 0.000604 wd 0.0500 time 0.5731 (0.5924) data time 0.0008 (0.0038) model time 0.5723 (0.5923) loss 6.4294 (7.4036) grad_norm 1.9743 (2.4723) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:26:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][200/625] eta 0:04:11 lr 0.000604 wd 0.0500 time 0.5745 (0.5921) data time 0.0008 (0.0037) model time 0.5737 (0.5918) loss 6.4262 (7.3978) grad_norm 1.6464 (2.4536) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:26:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][210/625] eta 0:04:05 lr 0.000604 wd 0.0500 time 0.5761 (0.5919) data time 0.0007 (0.0036) model time 0.5755 (0.5916) loss 7.6621 (7.3934) grad_norm 1.8583 (2.4253) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:27:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][220/625] eta 0:03:59 lr 0.000604 wd 0.0500 time 0.5774 (0.5911) data time 0.0007 (0.0034) model time 0.5768 (0.5905) loss 7.8784 (7.3925) grad_norm 1.8177 (2.4152) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:27:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][230/625] eta 0:03:53 lr 0.000604 wd 0.0500 time 0.5755 (0.5903) data time 0.0006 (0.0033) model time 0.5749 (0.5895) loss 8.4528 (7.4077) grad_norm 2.4828 (2.4016) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:27:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][240/625] eta 0:03:47 lr 0.000603 wd 0.0500 time 0.5759 (0.5897) data time 0.0006 (0.0032) model time 0.5753 (0.5887) loss 6.4318 (7.4021) grad_norm 2.1786 (2.3948) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:27:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][250/625] eta 0:03:40 lr 0.000603 wd 0.0500 time 0.5847 (0.5892) data time 0.0008 (0.0032) model time 0.5839 (0.5880) loss 7.8635 (7.4160) grad_norm 1.7771 (2.3960) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:27:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][260/625] eta 0:03:34 lr 0.000603 wd 0.0500 time 0.5771 (0.5888) data time 0.0008 (0.0032) model time 0.5763 (0.5874) loss 7.1564 (7.4095) grad_norm 1.8160 (2.3901) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:27:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][270/625] eta 0:03:28 lr 0.000603 wd 0.0500 time 0.5896 (0.5883) data time 0.0006 (0.0031) model time 0.5889 (0.5869) loss 8.5060 (7.4033) grad_norm 1.5971 (2.3776) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:27:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][280/625] eta 0:03:22 lr 0.000603 wd 0.0500 time 0.5735 (0.5879) data time 0.0008 (0.0030) model time 0.5727 (0.5864) loss 7.5736 (7.4204) grad_norm 2.4056 (2.3770) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:27:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][290/625] eta 0:03:16 lr 0.000603 wd 0.0500 time 0.5784 (0.5874) data time 0.0006 (0.0029) model time 0.5778 (0.5859) loss 8.7210 (7.4205) grad_norm 1.8194 (2.3713) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:27:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][300/625] eta 0:03:10 lr 0.000603 wd 0.0500 time 0.5739 (0.5871) data time 0.0006 (0.0029) model time 0.5733 (0.5855) loss 8.3911 (7.4252) grad_norm 2.1851 (2.3767) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:27:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][310/625] eta 0:03:04 lr 0.000603 wd 0.0500 time 0.5781 (0.5868) data time 0.0008 (0.0028) model time 0.5774 (0.5851) loss 6.5482 (7.4232) grad_norm 1.7696 (2.3773) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:27:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][320/625] eta 0:02:58 lr 0.000603 wd 0.0500 time 0.5738 (0.5864) data time 0.0008 (0.0027) model time 0.5730 (0.5847) loss 7.3483 (7.4287) grad_norm 1.5922 (2.3701) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:28:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][330/625] eta 0:02:52 lr 0.000602 wd 0.0500 time 0.5770 (0.5861) data time 0.0007 (0.0027) model time 0.5763 (0.5844) loss 7.7912 (7.4337) grad_norm 3.4434 (2.3775) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:28:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][340/625] eta 0:02:46 lr 0.000602 wd 0.0500 time 0.5806 (0.5858) data time 0.0006 (0.0026) model time 0.5800 (0.5841) loss 6.1392 (7.4406) grad_norm 2.7518 (2.3964) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:28:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][350/625] eta 0:02:41 lr 0.000602 wd 0.0500 time 0.5797 (0.5855) data time 0.0008 (0.0026) model time 0.5790 (0.5837) loss 8.4777 (7.4417) grad_norm 2.1408 (2.3990) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:28:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][360/625] eta 0:02:35 lr 0.000602 wd 0.0500 time 0.5770 (0.5852) data time 0.0006 (0.0025) model time 0.5763 (0.5834) loss 7.8133 (7.4508) grad_norm 3.2772 (2.4131) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:28:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][370/625] eta 0:02:29 lr 0.000602 wd 0.0500 time 0.5748 (0.5853) data time 0.0006 (0.0025) model time 0.5741 (0.5836) loss 7.5785 (7.4511) grad_norm 1.9213 (2.4372) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:28:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][380/625] eta 0:02:23 lr 0.000602 wd 0.0500 time 0.6259 (0.5857) data time 0.0006 (0.0024) model time 0.6253 (0.5841) loss 6.6160 (7.4492) grad_norm 1.7454 (2.4297) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:28:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][390/625] eta 0:02:17 lr 0.000602 wd 0.0500 time 0.7039 (0.5867) data time 0.0006 (0.0024) model time 0.7033 (0.5853) loss 6.9738 (7.4463) grad_norm 1.9139 (2.4250) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:28:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][400/625] eta 0:02:12 lr 0.000602 wd 0.0500 time 0.5703 (0.5881) data time 0.0008 (0.0023) model time 0.5695 (0.5869) loss 7.3195 (7.4546) grad_norm 2.1846 (2.4192) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:28:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][410/625] eta 0:02:06 lr 0.000602 wd 0.0500 time 0.5727 (0.5889) data time 0.0010 (0.0023) model time 0.5716 (0.5878) loss 8.2034 (7.4615) grad_norm 5.8765 (2.4221) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:28:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][420/625] eta 0:02:00 lr 0.000602 wd 0.0500 time 0.5771 (0.5888) data time 0.0006 (0.0023) model time 0.5765 (0.5877) loss 7.3475 (7.4554) grad_norm 1.8634 (2.4256) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:29:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][430/625] eta 0:01:54 lr 0.000601 wd 0.0500 time 0.5751 (0.5888) data time 0.0006 (0.0022) model time 0.5745 (0.5877) loss 8.0733 (7.4608) grad_norm 4.6175 (2.4240) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:29:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][440/625] eta 0:01:48 lr 0.000601 wd 0.0500 time 0.5747 (0.5886) data time 0.0006 (0.0022) model time 0.5741 (0.5874) loss 5.9547 (7.4675) grad_norm 3.0745 (2.4252) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:29:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][450/625] eta 0:01:42 lr 0.000601 wd 0.0500 time 0.5732 (0.5882) data time 0.0008 (0.0022) model time 0.5724 (0.5871) loss 8.3422 (7.4750) grad_norm 2.6751 (2.4350) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:29:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][460/625] eta 0:01:37 lr 0.000601 wd 0.0500 time 0.5692 (0.5879) data time 0.0007 (0.0021) model time 0.5685 (0.5867) loss 7.5588 (7.4678) grad_norm 1.8860 (2.4588) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:29:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][470/625] eta 0:01:31 lr 0.000601 wd 0.0500 time 0.5756 (0.5877) data time 0.0006 (0.0021) model time 0.5750 (0.5864) loss 7.0863 (7.4725) grad_norm 2.1527 (2.4637) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:29:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][480/625] eta 0:01:25 lr 0.000601 wd 0.0500 time 0.5702 (0.5874) data time 0.0008 (0.0021) model time 0.5694 (0.5861) loss 7.7744 (7.4745) grad_norm 2.4953 (2.4719) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:29:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][490/625] eta 0:01:19 lr 0.000601 wd 0.0500 time 0.5722 (0.5871) data time 0.0007 (0.0021) model time 0.5716 (0.5859) loss 7.6970 (7.4810) grad_norm 2.4007 (2.4725) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:29:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][500/625] eta 0:01:13 lr 0.000601 wd 0.0500 time 0.5755 (0.5869) data time 0.0008 (0.0020) model time 0.5748 (0.5856) loss 8.3523 (7.4837) grad_norm 2.4739 (2.4681) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:29:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][510/625] eta 0:01:07 lr 0.000601 wd 0.0500 time 0.5744 (0.5866) data time 0.0008 (0.0020) model time 0.5736 (0.5853) loss 7.6679 (7.4842) grad_norm 2.7926 (2.4735) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:29:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][520/625] eta 0:01:01 lr 0.000600 wd 0.0500 time 0.5739 (0.5864) data time 0.0006 (0.0020) model time 0.5733 (0.5851) loss 8.1693 (7.4929) grad_norm 3.1107 (2.4704) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:30:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][530/625] eta 0:00:55 lr 0.000600 wd 0.0500 time 0.5738 (0.5862) data time 0.0008 (0.0020) model time 0.5730 (0.5848) loss 7.0537 (7.4873) grad_norm 2.3403 (2.4702) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:30:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][540/625] eta 0:00:49 lr 0.000600 wd 0.0500 time 0.5696 (0.5859) data time 0.0008 (0.0020) model time 0.5688 (0.5846) loss 8.1576 (7.4816) grad_norm 3.2888 (2.4872) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:30:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][550/625] eta 0:00:43 lr 0.000600 wd 0.0500 time 0.5810 (0.5858) data time 0.0006 (0.0019) model time 0.5804 (0.5844) loss 8.4734 (7.4801) grad_norm 2.6019 (2.4858) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:30:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][560/625] eta 0:00:38 lr 0.000600 wd 0.0500 time 0.5751 (0.5856) data time 0.0006 (0.0019) model time 0.5745 (0.5842) loss 6.5530 (7.4784) grad_norm 2.1240 (2.4790) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:30:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][570/625] eta 0:00:32 lr 0.000600 wd 0.0500 time 0.5770 (0.5854) data time 0.0008 (0.0019) model time 0.5762 (0.5840) loss 6.3007 (7.4766) grad_norm 2.7339 (2.4747) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:30:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][580/625] eta 0:00:26 lr 0.000600 wd 0.0500 time 0.5753 (0.5853) data time 0.0008 (0.0019) model time 0.5745 (0.5839) loss 8.2665 (7.4792) grad_norm 1.9826 (2.4694) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:30:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][590/625] eta 0:00:20 lr 0.000600 wd 0.0500 time 0.5733 (0.5852) data time 0.0006 (0.0019) model time 0.5727 (0.5838) loss 7.5560 (7.4787) grad_norm 1.8372 (2.4620) loss_scale 4096.0000 (2058.3959) mem 22339MB +[2024-07-25 06:30:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][600/625] eta 0:00:14 lr 0.000600 wd 0.0500 time 0.6989 (0.5858) data time 0.0006 (0.0019) model time 0.6983 (0.5844) loss 7.6883 (7.4790) grad_norm 2.2890 (2.4659) loss_scale 4096.0000 (2092.2995) mem 22339MB +[2024-07-25 06:30:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][610/625] eta 0:00:08 lr 0.000599 wd 0.0500 time 0.7434 (0.5865) data time 0.0004 (0.0019) model time 0.7430 (0.5853) loss 7.3128 (7.4790) grad_norm 2.0711 (2.4600) loss_scale 4096.0000 (2125.0933) mem 22339MB +[2024-07-25 06:30:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [160/300][620/625] eta 0:00:02 lr 0.000599 wd 0.0500 time 0.5863 (0.5873) data time 0.0006 (0.0018) model time 0.5857 (0.5862) loss 7.2353 (7.4790) grad_norm 1.5825 (2.4542) loss_scale 4096.0000 (2156.8309) mem 22339MB +[2024-07-25 06:30:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 160 training takes 0:06:07 +[2024-07-25 06:30:57 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 06:30:58 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 06:30:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.466 (0.466) Loss 0.5269 (0.5269) Acc@1 88.965 (88.965) Acc@5 98.779 (98.779) Mem 22339MB +[2024-07-25 06:31:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.156) Loss 0.8081 (0.6437) Acc@1 81.201 (86.186) Acc@5 96.143 (97.754) Mem 22339MB +[2024-07-25 06:31:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8940 (0.7496) Acc@1 78.711 (83.282) Acc@5 95.361 (96.649) Mem 22339MB +[2024-07-25 06:31:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.883 Acc@5 96.613 +[2024-07-25 06:31:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.9% +[2024-07-25 06:31:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 82.88% +[2024-07-25 06:31:02 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 06:31:03 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 06:31:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.471 (0.471) Loss 0.4939 (0.4939) Acc@1 89.746 (89.746) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-25 06:31:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7661 (0.6228) Acc@1 81.885 (86.785) Acc@5 96.387 (97.825) Mem 22339MB +[2024-07-25 06:31:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8955 (0.7241) Acc@1 78.174 (83.652) Acc@5 95.557 (96.833) Mem 22339MB +[2024-07-25 06:31:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.273 Acc@5 96.835 +[2024-07-25 06:31:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.3% +[2024-07-25 06:31:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.27% +[2024-07-25 06:31:07 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 06:31:08 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 06:31:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][0/625] eta 0:09:24 lr 0.000599 wd 0.0500 time 0.9038 (0.9038) data time 0.3843 (0.3843) model time 0.0000 (0.0000) loss 7.4998 (7.4998) grad_norm 2.9117 (2.9117) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 06:31:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][10/625] eta 0:06:22 lr 0.000599 wd 0.0500 time 0.5712 (0.6226) data time 0.0008 (0.0357) model time 0.0000 (0.0000) loss 7.0928 (7.4600) grad_norm 1.9272 (2.1800) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 06:31:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][20/625] eta 0:06:02 lr 0.000599 wd 0.0500 time 0.5698 (0.5990) data time 0.0006 (0.0191) model time 0.0000 (0.0000) loss 6.8582 (7.4010) grad_norm 2.2314 (2.2355) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 06:31:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][30/625] eta 0:05:51 lr 0.000599 wd 0.0500 time 0.5704 (0.5911) data time 0.0006 (0.0132) model time 0.0000 (0.0000) loss 6.3427 (7.3763) grad_norm 2.0684 (2.2830) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 06:31:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][40/625] eta 0:05:43 lr 0.000599 wd 0.0500 time 0.5690 (0.5867) data time 0.0006 (0.0102) model time 0.0000 (0.0000) loss 6.6499 (7.2605) grad_norm 2.2227 (2.3900) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 06:31:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][50/625] eta 0:05:35 lr 0.000599 wd 0.0500 time 0.5719 (0.5840) data time 0.0008 (0.0083) model time 0.0000 (0.0000) loss 5.8713 (7.2486) grad_norm 3.8007 (2.4527) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 06:31:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][60/625] eta 0:05:28 lr 0.000599 wd 0.0500 time 0.5703 (0.5822) data time 0.0008 (0.0071) model time 0.5695 (0.5720) loss 7.8826 (7.3035) grad_norm 2.8751 (2.4887) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 06:31:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][70/625] eta 0:05:22 lr 0.000599 wd 0.0500 time 0.5712 (0.5810) data time 0.0008 (0.0062) model time 0.5703 (0.5724) loss 7.3617 (7.2924) grad_norm 2.4389 (2.5198) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 06:31:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][80/625] eta 0:05:16 lr 0.000598 wd 0.0500 time 0.5730 (0.5800) data time 0.0008 (0.0055) model time 0.5722 (0.5725) loss 7.3924 (7.2360) grad_norm 2.9461 (2.4653) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 06:32:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][90/625] eta 0:05:10 lr 0.000598 wd 0.0500 time 0.5704 (0.5796) data time 0.0008 (0.0050) model time 0.5696 (0.5733) loss 8.8613 (7.2788) grad_norm 1.7836 (2.4805) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 06:32:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][100/625] eta 0:05:04 lr 0.000598 wd 0.0500 time 0.5748 (0.5792) data time 0.0006 (0.0046) model time 0.5742 (0.5734) loss 7.5183 (7.2587) grad_norm 4.8727 (2.4874) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 06:32:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][110/625] eta 0:04:58 lr 0.000598 wd 0.0500 time 0.5723 (0.5787) data time 0.0009 (0.0043) model time 0.5714 (0.5733) loss 7.4226 (7.2501) grad_norm 2.0363 (2.4538) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 06:32:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][120/625] eta 0:04:52 lr 0.000598 wd 0.0500 time 0.5717 (0.5783) data time 0.0006 (0.0040) model time 0.5710 (0.5733) loss 6.3987 (7.2746) grad_norm 2.4188 (2.4344) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 06:32:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][130/625] eta 0:04:46 lr 0.000598 wd 0.0500 time 0.5704 (0.5780) data time 0.0008 (0.0038) model time 0.5695 (0.5733) loss 8.2538 (7.2944) grad_norm 1.9015 (2.4325) loss_scale 4096.0000 (4096.0000) mem 22339MB +[2024-07-25 06:32:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][140/625] eta 0:04:40 lr 0.000598 wd 0.0500 time 0.5665 (0.5777) data time 0.0007 (0.0035) model time 0.5658 (0.5733) loss 8.3721 (7.3117) grad_norm inf (inf) loss_scale 2048.0000 (4081.4752) mem 22339MB +[2024-07-25 06:32:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][150/625] eta 0:04:34 lr 0.000598 wd 0.0500 time 0.5686 (0.5774) data time 0.0008 (0.0034) model time 0.5678 (0.5733) loss 7.8629 (7.3041) grad_norm 1.8670 (inf) loss_scale 2048.0000 (3946.8079) mem 22339MB +[2024-07-25 06:32:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][160/625] eta 0:04:28 lr 0.000598 wd 0.0500 time 0.5706 (0.5772) data time 0.0008 (0.0032) model time 0.5698 (0.5733) loss 6.1396 (7.2896) grad_norm 2.2774 (inf) loss_scale 2048.0000 (3828.8696) mem 22339MB +[2024-07-25 06:32:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][170/625] eta 0:04:23 lr 0.000598 wd 0.0500 time 0.5723 (0.5781) data time 0.0008 (0.0031) model time 0.5715 (0.5749) loss 8.5127 (7.3286) grad_norm 2.2292 (inf) loss_scale 2048.0000 (3724.7251) mem 22339MB +[2024-07-25 06:32:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][180/625] eta 0:04:17 lr 0.000597 wd 0.0500 time 0.5722 (0.5781) data time 0.0007 (0.0029) model time 0.5714 (0.5749) loss 5.9837 (7.3236) grad_norm 1.9623 (inf) loss_scale 2048.0000 (3632.0884) mem 22339MB +[2024-07-25 06:32:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][190/625] eta 0:04:12 lr 0.000597 wd 0.0500 time 0.6906 (0.5795) data time 0.0006 (0.0028) model time 0.6900 (0.5770) loss 8.2722 (7.3321) grad_norm 2.0244 (inf) loss_scale 2048.0000 (3549.1518) mem 22339MB +[2024-07-25 06:33:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][200/625] eta 0:04:07 lr 0.000597 wd 0.0500 time 0.5711 (0.5825) data time 0.0006 (0.0027) model time 0.5705 (0.5812) loss 6.8231 (7.3375) grad_norm 2.0564 (inf) loss_scale 2048.0000 (3474.4677) mem 22339MB +[2024-07-25 06:33:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][210/625] eta 0:04:03 lr 0.000597 wd 0.0500 time 0.7113 (0.5865) data time 0.0008 (0.0026) model time 0.7105 (0.5865) loss 8.0869 (7.3441) grad_norm 1.5531 (inf) loss_scale 2048.0000 (3406.8626) mem 22339MB +[2024-07-25 06:33:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][220/625] eta 0:03:58 lr 0.000597 wd 0.0500 time 0.6289 (0.5885) data time 0.0008 (0.0025) model time 0.6281 (0.5890) loss 6.7837 (7.3432) grad_norm 2.3447 (inf) loss_scale 2048.0000 (3345.3756) mem 22339MB +[2024-07-25 06:33:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][230/625] eta 0:03:52 lr 0.000597 wd 0.0500 time 0.5723 (0.5882) data time 0.0006 (0.0025) model time 0.5717 (0.5886) loss 8.0622 (7.3515) grad_norm 1.8987 (inf) loss_scale 2048.0000 (3289.2121) mem 22339MB +[2024-07-25 06:33:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][240/625] eta 0:03:46 lr 0.000597 wd 0.0500 time 0.5686 (0.5877) data time 0.0008 (0.0024) model time 0.5678 (0.5879) loss 7.0933 (7.3536) grad_norm 3.6085 (inf) loss_scale 2048.0000 (3237.7095) mem 22339MB +[2024-07-25 06:33:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][250/625] eta 0:03:40 lr 0.000597 wd 0.0500 time 0.5716 (0.5872) data time 0.0007 (0.0023) model time 0.5709 (0.5872) loss 8.7899 (7.3503) grad_norm 1.9458 (inf) loss_scale 2048.0000 (3190.3108) mem 22339MB +[2024-07-25 06:33:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][260/625] eta 0:03:34 lr 0.000597 wd 0.0500 time 0.5637 (0.5867) data time 0.0009 (0.0023) model time 0.5628 (0.5866) loss 8.6658 (7.3496) grad_norm 1.4776 (inf) loss_scale 2048.0000 (3146.5441) mem 22339MB +[2024-07-25 06:33:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][270/625] eta 0:03:28 lr 0.000596 wd 0.0500 time 0.5715 (0.5862) data time 0.0006 (0.0022) model time 0.5709 (0.5859) loss 8.6911 (7.3595) grad_norm 2.6747 (inf) loss_scale 2048.0000 (3106.0074) mem 22339MB +[2024-07-25 06:33:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][280/625] eta 0:03:22 lr 0.000596 wd 0.0500 time 0.5753 (0.5858) data time 0.0006 (0.0022) model time 0.5747 (0.5853) loss 8.3898 (7.3764) grad_norm 1.6816 (inf) loss_scale 2048.0000 (3068.3559) mem 22339MB +[2024-07-25 06:33:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][290/625] eta 0:03:16 lr 0.000596 wd 0.0500 time 0.5743 (0.5854) data time 0.0006 (0.0021) model time 0.5737 (0.5849) loss 8.5781 (7.3886) grad_norm 3.4554 (inf) loss_scale 2048.0000 (3033.2921) mem 22339MB +[2024-07-25 06:34:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][300/625] eta 0:03:10 lr 0.000596 wd 0.0500 time 0.5730 (0.5850) data time 0.0006 (0.0021) model time 0.5724 (0.5844) loss 7.8165 (7.3773) grad_norm 2.4605 (inf) loss_scale 2048.0000 (3000.5581) mem 22339MB +[2024-07-25 06:34:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][310/625] eta 0:03:04 lr 0.000596 wd 0.0500 time 0.5748 (0.5847) data time 0.0006 (0.0021) model time 0.5741 (0.5840) loss 5.8009 (7.3683) grad_norm 2.6588 (inf) loss_scale 2048.0000 (2969.9293) mem 22339MB +[2024-07-25 06:34:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][320/625] eta 0:02:58 lr 0.000596 wd 0.0500 time 0.5752 (0.5844) data time 0.0008 (0.0020) model time 0.5744 (0.5837) loss 6.8177 (7.3663) grad_norm 2.1229 (inf) loss_scale 2048.0000 (2941.2087) mem 22339MB +[2024-07-25 06:34:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][330/625] eta 0:02:52 lr 0.000596 wd 0.0500 time 0.5741 (0.5842) data time 0.0006 (0.0020) model time 0.5734 (0.5834) loss 8.2371 (7.3721) grad_norm 1.6375 (inf) loss_scale 2048.0000 (2914.2236) mem 22339MB +[2024-07-25 06:34:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][340/625] eta 0:02:46 lr 0.000596 wd 0.0500 time 0.5749 (0.5839) data time 0.0006 (0.0019) model time 0.5743 (0.5831) loss 7.4761 (7.3769) grad_norm 1.8568 (inf) loss_scale 2048.0000 (2888.8211) mem 22339MB +[2024-07-25 06:34:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][350/625] eta 0:02:40 lr 0.000596 wd 0.0500 time 0.5744 (0.5837) data time 0.0006 (0.0019) model time 0.5738 (0.5828) loss 6.9427 (7.3860) grad_norm 2.3708 (inf) loss_scale 2048.0000 (2864.8661) mem 22339MB +[2024-07-25 06:34:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][360/625] eta 0:02:34 lr 0.000595 wd 0.0500 time 0.5728 (0.5834) data time 0.0006 (0.0019) model time 0.5722 (0.5825) loss 7.8830 (7.3933) grad_norm 1.8138 (inf) loss_scale 2048.0000 (2842.2382) mem 22339MB +[2024-07-25 06:34:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][370/625] eta 0:02:28 lr 0.000595 wd 0.0500 time 0.5735 (0.5832) data time 0.0008 (0.0019) model time 0.5727 (0.5823) loss 7.5613 (7.3950) grad_norm 3.1222 (inf) loss_scale 2048.0000 (2820.8302) mem 22339MB +[2024-07-25 06:34:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][380/625] eta 0:02:22 lr 0.000595 wd 0.0500 time 0.5726 (0.5830) data time 0.0006 (0.0018) model time 0.5720 (0.5821) loss 7.0576 (7.3923) grad_norm 2.2110 (inf) loss_scale 2048.0000 (2800.5459) mem 22339MB +[2024-07-25 06:34:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][390/625] eta 0:02:17 lr 0.000595 wd 0.0500 time 0.5701 (0.5830) data time 0.0008 (0.0018) model time 0.5693 (0.5820) loss 7.1803 (7.3947) grad_norm 1.8054 (inf) loss_scale 2048.0000 (2781.2992) mem 22339MB +[2024-07-25 06:35:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][400/625] eta 0:02:11 lr 0.000595 wd 0.0500 time 0.5736 (0.5828) data time 0.0006 (0.0018) model time 0.5730 (0.5818) loss 7.9252 (7.3923) grad_norm 3.8415 (inf) loss_scale 2048.0000 (2763.0125) mem 22339MB +[2024-07-25 06:35:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][410/625] eta 0:02:05 lr 0.000595 wd 0.0500 time 0.7289 (0.5836) data time 0.0008 (0.0017) model time 0.7281 (0.5828) loss 7.3881 (7.3993) grad_norm 2.1071 (inf) loss_scale 2048.0000 (2745.6156) mem 22339MB +[2024-07-25 06:35:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][420/625] eta 0:01:59 lr 0.000595 wd 0.0500 time 0.5705 (0.5844) data time 0.0008 (0.0017) model time 0.5696 (0.5837) loss 6.8114 (7.4017) grad_norm 2.3181 (inf) loss_scale 2048.0000 (2729.0451) mem 22339MB +[2024-07-25 06:35:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][430/625] eta 0:01:54 lr 0.000595 wd 0.0500 time 0.6979 (0.5859) data time 0.0008 (0.0017) model time 0.6972 (0.5853) loss 8.1016 (7.4117) grad_norm 3.2673 (inf) loss_scale 2048.0000 (2713.2436) mem 22339MB +[2024-07-25 06:35:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][440/625] eta 0:01:48 lr 0.000595 wd 0.0500 time 0.6927 (0.5862) data time 0.0008 (0.0017) model time 0.6919 (0.5857) loss 6.4179 (7.4015) grad_norm 2.0014 (inf) loss_scale 2048.0000 (2698.1587) mem 22339MB +[2024-07-25 06:35:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][450/625] eta 0:01:42 lr 0.000595 wd 0.0500 time 0.6918 (0.5866) data time 0.0006 (0.0017) model time 0.6912 (0.5861) loss 6.3492 (7.3933) grad_norm 2.5023 (inf) loss_scale 2048.0000 (2683.7428) mem 22339MB +[2024-07-25 06:35:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][460/625] eta 0:01:36 lr 0.000594 wd 0.0500 time 0.5718 (0.5863) data time 0.0006 (0.0016) model time 0.5712 (0.5858) loss 7.0967 (7.4066) grad_norm 3.1521 (inf) loss_scale 2048.0000 (2669.9523) mem 22339MB +[2024-07-25 06:35:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][470/625] eta 0:01:30 lr 0.000594 wd 0.0500 time 0.5642 (0.5860) data time 0.0007 (0.0016) model time 0.5635 (0.5855) loss 6.9868 (7.3955) grad_norm 1.5341 (inf) loss_scale 2048.0000 (2656.7473) mem 22339MB +[2024-07-25 06:35:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][480/625] eta 0:01:24 lr 0.000594 wd 0.0500 time 0.5737 (0.5858) data time 0.0006 (0.0016) model time 0.5731 (0.5852) loss 7.5646 (7.4028) grad_norm 2.2991 (inf) loss_scale 2048.0000 (2644.0915) mem 22339MB +[2024-07-25 06:35:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][490/625] eta 0:01:19 lr 0.000594 wd 0.0500 time 0.5648 (0.5856) data time 0.0008 (0.0016) model time 0.5640 (0.5849) loss 6.7795 (7.3987) grad_norm 1.8926 (inf) loss_scale 2048.0000 (2631.9511) mem 22339MB +[2024-07-25 06:36:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][500/625] eta 0:01:13 lr 0.000594 wd 0.0500 time 0.5799 (0.5854) data time 0.0008 (0.0016) model time 0.5792 (0.5847) loss 7.2472 (7.3967) grad_norm 1.8114 (inf) loss_scale 2048.0000 (2620.2954) mem 22339MB +[2024-07-25 06:36:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][510/625] eta 0:01:07 lr 0.000594 wd 0.0500 time 0.5703 (0.5852) data time 0.0006 (0.0016) model time 0.5697 (0.5845) loss 8.2178 (7.3983) grad_norm 2.1158 (inf) loss_scale 2048.0000 (2609.0959) mem 22339MB +[2024-07-25 06:36:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][520/625] eta 0:01:01 lr 0.000594 wd 0.0500 time 0.5738 (0.5850) data time 0.0006 (0.0015) model time 0.5733 (0.5843) loss 7.9971 (7.3996) grad_norm 2.7753 (inf) loss_scale 2048.0000 (2598.3263) mem 22339MB +[2024-07-25 06:36:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][530/625] eta 0:00:55 lr 0.000594 wd 0.0500 time 0.5699 (0.5848) data time 0.0008 (0.0015) model time 0.5692 (0.5841) loss 6.4817 (7.3956) grad_norm 3.2818 (inf) loss_scale 2048.0000 (2587.9623) mem 22339MB +[2024-07-25 06:36:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][540/625] eta 0:00:49 lr 0.000594 wd 0.0500 time 0.5759 (0.5846) data time 0.0006 (0.0015) model time 0.5753 (0.5839) loss 6.4897 (7.3999) grad_norm 3.3572 (inf) loss_scale 2048.0000 (2577.9815) mem 22339MB +[2024-07-25 06:36:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][550/625] eta 0:00:43 lr 0.000593 wd 0.0500 time 0.5721 (0.5844) data time 0.0006 (0.0015) model time 0.5715 (0.5837) loss 5.3595 (7.4021) grad_norm 2.1114 (inf) loss_scale 2048.0000 (2568.3630) mem 22339MB +[2024-07-25 06:36:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][560/625] eta 0:00:37 lr 0.000593 wd 0.0500 time 0.5729 (0.5843) data time 0.0006 (0.0015) model time 0.5724 (0.5835) loss 7.8481 (7.4033) grad_norm 3.2308 (inf) loss_scale 2048.0000 (2559.0873) mem 22339MB +[2024-07-25 06:36:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][570/625] eta 0:00:32 lr 0.000593 wd 0.0500 time 0.5721 (0.5841) data time 0.0006 (0.0015) model time 0.5715 (0.5833) loss 7.8559 (7.4055) grad_norm 2.0466 (inf) loss_scale 2048.0000 (2550.1366) mem 22339MB +[2024-07-25 06:36:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][580/625] eta 0:00:26 lr 0.000593 wd 0.0500 time 0.5734 (0.5840) data time 0.0006 (0.0015) model time 0.5728 (0.5832) loss 7.9217 (7.4053) grad_norm 2.0189 (inf) loss_scale 2048.0000 (2541.4940) mem 22339MB +[2024-07-25 06:36:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][590/625] eta 0:00:20 lr 0.000593 wd 0.0500 time 0.5622 (0.5839) data time 0.0006 (0.0015) model time 0.5615 (0.5831) loss 7.0496 (7.4041) grad_norm 2.2981 (inf) loss_scale 2048.0000 (2533.1438) mem 22339MB +[2024-07-25 06:36:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][600/625] eta 0:00:14 lr 0.000593 wd 0.0500 time 0.5750 (0.5837) data time 0.0008 (0.0015) model time 0.5742 (0.5829) loss 6.3962 (7.4072) grad_norm 2.1415 (inf) loss_scale 2048.0000 (2525.0715) mem 22339MB +[2024-07-25 06:37:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][610/625] eta 0:00:08 lr 0.000593 wd 0.0500 time 0.5619 (0.5837) data time 0.0004 (0.0015) model time 0.5615 (0.5828) loss 6.6184 (7.3995) grad_norm 3.3954 (inf) loss_scale 2048.0000 (2517.2635) mem 22339MB +[2024-07-25 06:37:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [161/300][620/625] eta 0:00:02 lr 0.000593 wd 0.0500 time 0.5730 (0.5835) data time 0.0006 (0.0014) model time 0.5724 (0.5827) loss 7.8374 (7.4037) grad_norm 2.1934 (inf) loss_scale 2048.0000 (2509.7069) mem 22339MB +[2024-07-25 06:37:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 161 training takes 0:06:04 +[2024-07-25 06:37:13 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 06:37:15 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 06:37:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.464 (0.464) Loss 0.5337 (0.5337) Acc@1 89.502 (89.502) Acc@5 98.584 (98.584) Mem 22339MB +[2024-07-25 06:37:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.156) Loss 0.8389 (0.6657) Acc@1 80.420 (86.222) Acc@5 96.045 (97.599) Mem 22339MB +[2024-07-25 06:37:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.9404 (0.7692) Acc@1 77.246 (83.147) Acc@5 95.020 (96.538) Mem 22339MB +[2024-07-25 06:37:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.804 Acc@5 96.539 +[2024-07-25 06:37:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.8% +[2024-07-25 06:37:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.828 (0.828) Loss 0.4939 (0.4939) Acc@1 89.795 (89.795) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-25 06:37:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.190) Loss 0.7651 (0.6226) Acc@1 81.934 (86.834) Acc@5 96.484 (97.847) Mem 22339MB +[2024-07-25 06:37:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.159) Loss 0.8945 (0.7237) Acc@1 78.223 (83.705) Acc@5 95.654 (96.845) Mem 22339MB +[2024-07-25 06:37:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.311 Acc@5 96.847 +[2024-07-25 06:37:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.3% +[2024-07-25 06:37:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.31% +[2024-07-25 06:37:22 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 06:37:23 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 06:37:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][0/625] eta 0:09:16 lr 0.000593 wd 0.0500 time 0.8909 (0.8909) data time 0.3721 (0.3721) model time 0.0000 (0.0000) loss 7.7901 (7.7901) grad_norm 2.9498 (2.9498) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:37:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][10/625] eta 0:06:24 lr 0.000593 wd 0.0500 time 0.5728 (0.6251) data time 0.0006 (0.0345) model time 0.0000 (0.0000) loss 7.2800 (7.4009) grad_norm 1.8011 (2.6217) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:37:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][20/625] eta 0:06:18 lr 0.000592 wd 0.0500 time 0.7366 (0.6260) data time 0.0006 (0.0184) model time 0.0000 (0.0000) loss 8.3095 (7.3455) grad_norm 2.4835 (2.6084) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:37:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][30/625] eta 0:06:11 lr 0.000592 wd 0.0500 time 0.5715 (0.6243) data time 0.0008 (0.0127) model time 0.0000 (0.0000) loss 8.5077 (7.4003) grad_norm 2.0144 (2.5377) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:37:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][40/625] eta 0:06:02 lr 0.000592 wd 0.0500 time 0.5715 (0.6195) data time 0.0007 (0.0098) model time 0.0000 (0.0000) loss 8.5738 (7.4319) grad_norm 3.6372 (2.4813) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:37:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][50/625] eta 0:05:53 lr 0.000592 wd 0.0500 time 0.5628 (0.6142) data time 0.0006 (0.0080) model time 0.0000 (0.0000) loss 7.0892 (7.4434) grad_norm 2.1486 (2.4872) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:38:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][60/625] eta 0:05:43 lr 0.000592 wd 0.0500 time 0.5747 (0.6075) data time 0.0007 (0.0069) model time 0.5740 (0.5726) loss 8.0312 (7.4513) grad_norm 2.1737 (2.4568) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:38:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][70/625] eta 0:05:34 lr 0.000592 wd 0.0500 time 0.5699 (0.6026) data time 0.0006 (0.0060) model time 0.5692 (0.5723) loss 7.3064 (7.4357) grad_norm 3.2711 (2.5108) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:38:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][80/625] eta 0:05:26 lr 0.000592 wd 0.0500 time 0.5669 (0.5994) data time 0.0009 (0.0054) model time 0.5659 (0.5734) loss 7.8801 (7.5178) grad_norm 1.9354 (2.4696) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:38:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][90/625] eta 0:05:19 lr 0.000592 wd 0.0500 time 0.5729 (0.5970) data time 0.0009 (0.0049) model time 0.5720 (0.5743) loss 6.2946 (7.5339) grad_norm 1.4872 (2.3983) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:38:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][100/625] eta 0:05:12 lr 0.000592 wd 0.0500 time 0.5632 (0.5957) data time 0.0006 (0.0046) model time 0.5626 (0.5759) loss 8.1882 (7.5597) grad_norm 2.8792 (2.3622) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:38:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][110/625] eta 0:05:06 lr 0.000591 wd 0.0500 time 0.5677 (0.5943) data time 0.0008 (0.0042) model time 0.5669 (0.5764) loss 7.0775 (7.5511) grad_norm 5.1663 (2.4361) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:38:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][120/625] eta 0:04:59 lr 0.000591 wd 0.0500 time 0.5717 (0.5933) data time 0.0010 (0.0040) model time 0.5707 (0.5770) loss 6.9634 (7.5691) grad_norm 2.6201 (2.4698) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:38:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][130/625] eta 0:04:53 lr 0.000591 wd 0.0500 time 0.5713 (0.5919) data time 0.0008 (0.0037) model time 0.5704 (0.5768) loss 7.7648 (7.5664) grad_norm 3.1764 (2.4788) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:38:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][140/625] eta 0:04:47 lr 0.000591 wd 0.0500 time 0.5662 (0.5919) data time 0.0008 (0.0035) model time 0.5654 (0.5783) loss 6.9299 (7.5702) grad_norm 3.0807 (2.4773) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:38:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][150/625] eta 0:04:40 lr 0.000591 wd 0.0500 time 0.5667 (0.5908) data time 0.0006 (0.0033) model time 0.5661 (0.5779) loss 6.8139 (7.5647) grad_norm 3.0039 (2.4814) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:38:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][160/625] eta 0:04:34 lr 0.000591 wd 0.0500 time 0.5678 (0.5898) data time 0.0008 (0.0032) model time 0.5670 (0.5776) loss 7.9562 (7.5178) grad_norm 3.0215 (2.4931) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:39:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][170/625] eta 0:04:28 lr 0.000591 wd 0.0500 time 0.5713 (0.5891) data time 0.0008 (0.0030) model time 0.5705 (0.5775) loss 8.0886 (7.5139) grad_norm 1.8503 (2.4775) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:39:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][180/625] eta 0:04:22 lr 0.000591 wd 0.0500 time 0.5718 (0.5889) data time 0.0008 (0.0032) model time 0.5711 (0.5776) loss 8.6631 (7.5137) grad_norm 2.6653 (2.4730) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:39:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][190/625] eta 0:04:15 lr 0.000591 wd 0.0500 time 0.5712 (0.5881) data time 0.0008 (0.0031) model time 0.5704 (0.5774) loss 7.9563 (7.5067) grad_norm 1.7959 (2.4571) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:39:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][200/625] eta 0:04:09 lr 0.000591 wd 0.0500 time 0.5695 (0.5876) data time 0.0008 (0.0030) model time 0.5687 (0.5772) loss 6.6467 (7.4927) grad_norm 2.4181 (2.4434) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:39:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][210/625] eta 0:04:03 lr 0.000590 wd 0.0500 time 0.5710 (0.5870) data time 0.0008 (0.0029) model time 0.5702 (0.5770) loss 6.3138 (7.4916) grad_norm 2.1060 (2.4324) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:39:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][220/625] eta 0:03:57 lr 0.000590 wd 0.0500 time 0.6290 (0.5867) data time 0.0006 (0.0028) model time 0.6284 (0.5772) loss 7.0286 (7.4888) grad_norm 2.7766 (2.4266) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:39:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][230/625] eta 0:03:52 lr 0.000590 wd 0.0500 time 0.7487 (0.5883) data time 0.0006 (0.0028) model time 0.7481 (0.5797) loss 7.2826 (7.4910) grad_norm 1.7043 (2.4368) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:39:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][240/625] eta 0:03:46 lr 0.000590 wd 0.0500 time 0.5621 (0.5891) data time 0.0007 (0.0027) model time 0.5614 (0.5811) loss 6.1905 (7.4886) grad_norm 1.8603 (2.4270) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:39:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][250/625] eta 0:03:41 lr 0.000590 wd 0.0500 time 0.5653 (0.5916) data time 0.0008 (0.0026) model time 0.5645 (0.5846) loss 6.0099 (7.4757) grad_norm 1.9166 (2.4122) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:39:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][260/625] eta 0:03:36 lr 0.000590 wd 0.0500 time 0.5711 (0.5921) data time 0.0006 (0.0026) model time 0.5705 (0.5856) loss 6.6053 (7.4795) grad_norm 1.8877 (2.4101) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:40:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][270/625] eta 0:03:30 lr 0.000590 wd 0.0500 time 0.5736 (0.5921) data time 0.0006 (0.0025) model time 0.5730 (0.5858) loss 6.8832 (7.4751) grad_norm 2.6994 (2.4157) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:40:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][280/625] eta 0:03:24 lr 0.000590 wd 0.0500 time 0.5657 (0.5915) data time 0.0006 (0.0024) model time 0.5651 (0.5853) loss 7.9638 (7.4775) grad_norm 2.8554 (2.4156) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:40:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][290/625] eta 0:03:17 lr 0.000590 wd 0.0500 time 0.5673 (0.5910) data time 0.0006 (0.0024) model time 0.5666 (0.5849) loss 7.4153 (7.4718) grad_norm 1.5914 (2.4011) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:40:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][300/625] eta 0:03:12 lr 0.000589 wd 0.0500 time 0.5705 (0.5909) data time 0.0006 (0.0023) model time 0.5699 (0.5849) loss 6.7698 (7.4697) grad_norm 2.3379 (2.4052) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:40:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][310/625] eta 0:03:06 lr 0.000589 wd 0.0500 time 0.5677 (0.5905) data time 0.0008 (0.0023) model time 0.5669 (0.5847) loss 7.0974 (7.4761) grad_norm 1.8412 (2.4246) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:40:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][320/625] eta 0:03:00 lr 0.000589 wd 0.0500 time 0.5642 (0.5908) data time 0.0008 (0.0023) model time 0.5633 (0.5852) loss 5.4622 (7.4660) grad_norm 1.9880 (2.4280) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:40:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][330/625] eta 0:02:54 lr 0.000589 wd 0.0500 time 0.5623 (0.5905) data time 0.0008 (0.0022) model time 0.5615 (0.5850) loss 8.5053 (7.4547) grad_norm 2.5217 (2.4517) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:40:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][340/625] eta 0:02:48 lr 0.000589 wd 0.0500 time 0.5640 (0.5901) data time 0.0008 (0.0022) model time 0.5632 (0.5847) loss 9.1101 (7.4542) grad_norm 1.8954 (2.4597) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:40:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][350/625] eta 0:02:42 lr 0.000589 wd 0.0500 time 0.5696 (0.5897) data time 0.0006 (0.0021) model time 0.5690 (0.5844) loss 6.4963 (7.4509) grad_norm 1.7291 (2.4605) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:40:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][360/625] eta 0:02:36 lr 0.000589 wd 0.0500 time 0.5589 (0.5899) data time 0.0008 (0.0021) model time 0.5581 (0.5847) loss 8.6365 (7.4489) grad_norm 1.7863 (2.4548) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:41:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][370/625] eta 0:02:30 lr 0.000589 wd 0.0500 time 0.5631 (0.5896) data time 0.0006 (0.0021) model time 0.5625 (0.5845) loss 9.1290 (7.4483) grad_norm 3.0057 (2.4434) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:41:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][380/625] eta 0:02:24 lr 0.000589 wd 0.0500 time 0.5718 (0.5893) data time 0.0008 (0.0021) model time 0.5711 (0.5843) loss 6.6937 (7.4306) grad_norm 2.7136 (2.4338) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:41:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][390/625] eta 0:02:18 lr 0.000589 wd 0.0500 time 0.5730 (0.5891) data time 0.0006 (0.0022) model time 0.5724 (0.5840) loss 5.8160 (7.4383) grad_norm 4.6502 (2.4320) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:41:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][400/625] eta 0:02:12 lr 0.000588 wd 0.0500 time 0.5700 (0.5888) data time 0.0006 (0.0021) model time 0.5694 (0.5838) loss 7.1047 (7.4475) grad_norm 2.1686 (2.4365) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:41:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][410/625] eta 0:02:06 lr 0.000588 wd 0.0500 time 0.5601 (0.5885) data time 0.0006 (0.0021) model time 0.5594 (0.5836) loss 6.9753 (7.4360) grad_norm 1.7669 (2.4361) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:41:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][420/625] eta 0:02:00 lr 0.000588 wd 0.0500 time 0.5718 (0.5882) data time 0.0008 (0.0021) model time 0.5711 (0.5834) loss 5.9610 (7.4322) grad_norm 2.2833 (2.4335) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:41:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][430/625] eta 0:01:54 lr 0.000588 wd 0.0500 time 0.5722 (0.5880) data time 0.0008 (0.0020) model time 0.5714 (0.5832) loss 8.3582 (7.4335) grad_norm 2.5488 (2.4304) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:41:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][440/625] eta 0:01:48 lr 0.000588 wd 0.0500 time 0.7078 (0.5880) data time 0.0006 (0.0020) model time 0.7072 (0.5833) loss 6.5869 (7.4335) grad_norm 1.8972 (2.4261) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:41:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][450/625] eta 0:01:42 lr 0.000588 wd 0.0500 time 0.7202 (0.5883) data time 0.0006 (0.0020) model time 0.7195 (0.5838) loss 7.2799 (7.4325) grad_norm 2.4988 (2.4195) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:41:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][460/625] eta 0:01:37 lr 0.000588 wd 0.0500 time 0.7343 (0.5891) data time 0.0006 (0.0019) model time 0.7337 (0.5848) loss 6.6312 (7.4322) grad_norm 1.9347 (2.4136) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:42:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][470/625] eta 0:01:31 lr 0.000588 wd 0.0500 time 0.5725 (0.5907) data time 0.0008 (0.0019) model time 0.5718 (0.5866) loss 5.8392 (7.4222) grad_norm 1.7723 (2.4093) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:42:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][480/625] eta 0:01:25 lr 0.000588 wd 0.0500 time 0.5715 (0.5914) data time 0.0008 (0.0019) model time 0.5707 (0.5875) loss 8.0441 (7.4258) grad_norm 1.5597 (2.4127) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:42:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][490/625] eta 0:01:19 lr 0.000587 wd 0.0500 time 0.5627 (0.5915) data time 0.0008 (0.0019) model time 0.5619 (0.5877) loss 5.9880 (7.4317) grad_norm 2.2081 (2.4078) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:42:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][500/625] eta 0:01:13 lr 0.000587 wd 0.0500 time 0.5629 (0.5912) data time 0.0006 (0.0019) model time 0.5622 (0.5875) loss 7.4370 (7.4244) grad_norm 2.1316 (2.4065) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:42:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][510/625] eta 0:01:07 lr 0.000587 wd 0.0500 time 0.5616 (0.5911) data time 0.0006 (0.0019) model time 0.5610 (0.5874) loss 6.0665 (7.4260) grad_norm 3.0287 (2.4113) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:42:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][520/625] eta 0:01:02 lr 0.000587 wd 0.0500 time 0.5739 (0.5909) data time 0.0006 (0.0018) model time 0.5732 (0.5871) loss 6.3647 (7.4233) grad_norm 1.8241 (2.4192) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:42:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][530/625] eta 0:00:56 lr 0.000587 wd 0.0500 time 0.5678 (0.5906) data time 0.0009 (0.0018) model time 0.5669 (0.5869) loss 6.9948 (7.4148) grad_norm 1.5467 (2.4127) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:42:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][540/625] eta 0:00:50 lr 0.000587 wd 0.0500 time 0.5754 (0.5904) data time 0.0008 (0.0018) model time 0.5747 (0.5867) loss 6.0908 (7.4114) grad_norm 3.4469 (2.4099) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:42:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][550/625] eta 0:00:44 lr 0.000587 wd 0.0500 time 0.5667 (0.5902) data time 0.0007 (0.0018) model time 0.5660 (0.5866) loss 6.1608 (7.4119) grad_norm 1.9384 (2.4055) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:42:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][560/625] eta 0:00:38 lr 0.000587 wd 0.0500 time 0.5602 (0.5902) data time 0.0007 (0.0018) model time 0.5595 (0.5866) loss 8.3889 (7.4164) grad_norm 1.6050 (2.3980) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:43:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][570/625] eta 0:00:32 lr 0.000587 wd 0.0500 time 0.5703 (0.5900) data time 0.0007 (0.0018) model time 0.5697 (0.5865) loss 6.7786 (7.4130) grad_norm 1.5825 (2.3988) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:43:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][580/625] eta 0:00:26 lr 0.000586 wd 0.0500 time 0.5764 (0.5901) data time 0.0008 (0.0018) model time 0.5757 (0.5867) loss 8.7520 (7.4115) grad_norm 2.3493 (2.3973) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:43:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][590/625] eta 0:00:20 lr 0.000586 wd 0.0500 time 0.5616 (0.5900) data time 0.0008 (0.0018) model time 0.5608 (0.5866) loss 8.6569 (7.4109) grad_norm 2.4451 (2.3944) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:43:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][600/625] eta 0:00:14 lr 0.000586 wd 0.0500 time 0.5674 (0.5899) data time 0.0006 (0.0018) model time 0.5668 (0.5864) loss 6.2153 (7.4115) grad_norm 3.0776 (2.3945) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:43:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][610/625] eta 0:00:08 lr 0.000586 wd 0.0500 time 0.5689 (0.5897) data time 0.0004 (0.0018) model time 0.5686 (0.5863) loss 7.7825 (7.4152) grad_norm 2.0745 (2.3933) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:43:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [162/300][620/625] eta 0:00:02 lr 0.000586 wd 0.0500 time 0.5616 (0.5896) data time 0.0006 (0.0018) model time 0.5610 (0.5862) loss 8.2511 (7.4143) grad_norm 2.2401 (2.3924) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:43:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 162 training takes 0:06:08 +[2024-07-25 06:43:32 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 06:43:33 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 06:43:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.477 (0.477) Loss 0.5132 (0.5132) Acc@1 89.697 (89.697) Acc@5 98.730 (98.730) Mem 22339MB +[2024-07-25 06:43:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.161) Loss 0.8179 (0.6509) Acc@1 81.006 (86.390) Acc@5 95.996 (97.652) Mem 22339MB +[2024-07-25 06:43:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.144) Loss 0.9287 (0.7595) Acc@1 77.637 (83.238) Acc@5 95.557 (96.587) Mem 22339MB +[2024-07-25 06:43:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.831 Acc@5 96.587 +[2024-07-25 06:43:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.8% +[2024-07-25 06:43:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.822 (0.822) Loss 0.4941 (0.4941) Acc@1 89.795 (89.795) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 06:43:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.192) Loss 0.7637 (0.6223) Acc@1 81.885 (86.847) Acc@5 96.533 (97.856) Mem 22339MB +[2024-07-25 06:43:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.163) Loss 0.8931 (0.7232) Acc@1 78.369 (83.738) Acc@5 95.703 (96.863) Mem 22339MB +[2024-07-25 06:43:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.333 Acc@5 96.869 +[2024-07-25 06:43:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.3% +[2024-07-25 06:43:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.33% +[2024-07-25 06:43:41 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 06:43:42 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 06:43:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][0/625] eta 0:09:06 lr 0.000586 wd 0.0500 time 0.8737 (0.8737) data time 0.3560 (0.3560) model time 0.0000 (0.0000) loss 8.9753 (8.9753) grad_norm 1.9160 (1.9160) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:43:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][10/625] eta 0:06:17 lr 0.000586 wd 0.0500 time 0.5679 (0.6131) data time 0.0008 (0.0336) model time 0.0000 (0.0000) loss 6.2542 (7.6905) grad_norm 2.4485 (2.2484) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:43:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][20/625] eta 0:06:03 lr 0.000586 wd 0.0500 time 0.5686 (0.6015) data time 0.0007 (0.0182) model time 0.0000 (0.0000) loss 6.5740 (7.4347) grad_norm 2.4099 (2.2758) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:44:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][30/625] eta 0:05:54 lr 0.000586 wd 0.0500 time 0.5638 (0.5952) data time 0.0008 (0.0126) model time 0.0000 (0.0000) loss 8.5652 (7.3939) grad_norm 1.7051 (2.3179) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:44:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][40/625] eta 0:05:49 lr 0.000586 wd 0.0500 time 0.5981 (0.5973) data time 0.0006 (0.0102) model time 0.0000 (0.0000) loss 7.4939 (7.3630) grad_norm 2.3891 (2.2967) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:44:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][50/625] eta 0:05:46 lr 0.000585 wd 0.0500 time 0.6988 (0.6034) data time 0.0009 (0.0084) model time 0.0000 (0.0000) loss 8.5507 (7.4307) grad_norm 1.9788 (2.2626) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:44:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][60/625] eta 0:05:42 lr 0.000585 wd 0.0500 time 0.6834 (0.6059) data time 0.0008 (0.0072) model time 0.6826 (0.6177) loss 8.0557 (7.4973) grad_norm 2.4971 (2.2346) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:44:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][70/625] eta 0:05:41 lr 0.000585 wd 0.0500 time 0.6994 (0.6147) data time 0.0006 (0.0064) model time 0.6988 (0.6424) loss 7.8305 (7.4275) grad_norm 2.2254 (2.2878) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:44:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][80/625] eta 0:05:33 lr 0.000585 wd 0.0500 time 0.5622 (0.6123) data time 0.0006 (0.0058) model time 0.5616 (0.6261) loss 7.4844 (7.4690) grad_norm 1.6821 (2.2576) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:44:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][90/625] eta 0:05:26 lr 0.000585 wd 0.0500 time 0.5602 (0.6108) data time 0.0006 (0.0053) model time 0.5595 (0.6189) loss 7.9440 (7.3972) grad_norm 3.0046 (2.3300) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:44:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][100/625] eta 0:05:19 lr 0.000585 wd 0.0500 time 0.5702 (0.6094) data time 0.0008 (0.0049) model time 0.5694 (0.6143) loss 7.9643 (7.4130) grad_norm 2.3709 (2.3442) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:44:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][110/625] eta 0:05:13 lr 0.000585 wd 0.0500 time 0.5742 (0.6081) data time 0.0007 (0.0045) model time 0.5735 (0.6109) loss 7.2525 (7.3789) grad_norm 2.9669 (2.3459) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:44:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][120/625] eta 0:05:06 lr 0.000585 wd 0.0500 time 0.5663 (0.6064) data time 0.0008 (0.0043) model time 0.5655 (0.6074) loss 7.4353 (7.3917) grad_norm 1.7989 (2.3267) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:45:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][130/625] eta 0:04:59 lr 0.000585 wd 0.0500 time 0.5618 (0.6054) data time 0.0006 (0.0041) model time 0.5612 (0.6053) loss 7.9450 (7.3761) grad_norm 2.0109 (2.3285) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:45:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][140/625] eta 0:04:53 lr 0.000585 wd 0.0500 time 0.5639 (0.6043) data time 0.0007 (0.0039) model time 0.5632 (0.6035) loss 8.3634 (7.3895) grad_norm 2.8861 (2.3255) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:45:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][150/625] eta 0:04:46 lr 0.000584 wd 0.0500 time 0.5731 (0.6037) data time 0.0008 (0.0039) model time 0.5723 (0.6024) loss 6.9213 (7.3706) grad_norm 1.7904 (2.3398) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:45:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][160/625] eta 0:04:40 lr 0.000584 wd 0.0500 time 0.5656 (0.6029) data time 0.0006 (0.0037) model time 0.5650 (0.6012) loss 7.1153 (7.3719) grad_norm 2.5077 (2.3565) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:45:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][170/625] eta 0:04:34 lr 0.000584 wd 0.0500 time 0.5610 (0.6024) data time 0.0006 (0.0037) model time 0.5604 (0.6003) loss 8.5919 (7.3785) grad_norm 2.0475 (2.3337) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:45:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][180/625] eta 0:04:27 lr 0.000584 wd 0.0500 time 0.5604 (0.6021) data time 0.0007 (0.0036) model time 0.5596 (0.6000) loss 6.0769 (7.3395) grad_norm 2.2983 (2.3313) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:45:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][190/625] eta 0:04:21 lr 0.000584 wd 0.0500 time 0.5577 (0.6020) data time 0.0007 (0.0035) model time 0.5570 (0.5999) loss 6.9588 (7.3475) grad_norm 1.7915 (2.3328) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:45:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][200/625] eta 0:04:15 lr 0.000584 wd 0.0500 time 0.5709 (0.6016) data time 0.0009 (0.0034) model time 0.5700 (0.5993) loss 8.5992 (7.3638) grad_norm 1.8034 (2.3679) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:45:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][210/625] eta 0:04:09 lr 0.000584 wd 0.0500 time 0.5619 (0.6010) data time 0.0006 (0.0033) model time 0.5612 (0.5985) loss 7.2340 (7.3762) grad_norm 1.7641 (2.3677) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:45:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][220/625] eta 0:04:03 lr 0.000584 wd 0.0500 time 0.5638 (0.6009) data time 0.0008 (0.0032) model time 0.5630 (0.5985) loss 8.7359 (7.3496) grad_norm 1.8168 (2.3591) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:46:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][230/625] eta 0:03:57 lr 0.000584 wd 0.0500 time 0.5630 (0.6008) data time 0.0008 (0.0031) model time 0.5623 (0.5984) loss 6.2519 (7.3492) grad_norm 2.0996 (2.3507) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:46:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][240/625] eta 0:03:51 lr 0.000583 wd 0.0500 time 0.5724 (0.6006) data time 0.0006 (0.0031) model time 0.5718 (0.5982) loss 6.8037 (7.3302) grad_norm 1.5595 (2.3382) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:46:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][250/625] eta 0:03:44 lr 0.000583 wd 0.0500 time 0.5611 (0.6000) data time 0.0008 (0.0031) model time 0.5603 (0.5974) loss 7.6822 (7.3219) grad_norm 2.6864 (2.3275) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:46:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][260/625] eta 0:03:39 lr 0.000583 wd 0.0500 time 0.6742 (0.6005) data time 0.0006 (0.0030) model time 0.6736 (0.5981) loss 5.8152 (7.3207) grad_norm 2.6148 (2.3390) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:46:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][270/625] eta 0:03:33 lr 0.000583 wd 0.0500 time 0.5614 (0.6019) data time 0.0006 (0.0030) model time 0.5608 (0.5999) loss 4.9588 (7.3218) grad_norm 4.1566 (2.3668) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:46:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][280/625] eta 0:03:28 lr 0.000583 wd 0.0500 time 0.5609 (0.6029) data time 0.0007 (0.0030) model time 0.5603 (0.6011) loss 7.2021 (7.3185) grad_norm 2.3885 (2.3788) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:46:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][290/625] eta 0:03:22 lr 0.000583 wd 0.0500 time 0.5620 (0.6045) data time 0.0008 (0.0029) model time 0.5612 (0.6029) loss 9.1173 (7.3244) grad_norm 2.8068 (2.3813) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:46:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][300/625] eta 0:03:16 lr 0.000583 wd 0.0500 time 0.7108 (0.6049) data time 0.0008 (0.0029) model time 0.7100 (0.6035) loss 6.5120 (7.3328) grad_norm 3.0258 (2.4062) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:46:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][310/625] eta 0:03:10 lr 0.000583 wd 0.0500 time 0.5624 (0.6046) data time 0.0006 (0.0028) model time 0.5618 (0.6031) loss 6.4660 (7.3413) grad_norm 2.9492 (2.4223) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:46:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][320/625] eta 0:03:04 lr 0.000583 wd 0.0500 time 0.5601 (0.6040) data time 0.0007 (0.0028) model time 0.5593 (0.6024) loss 8.9183 (7.3641) grad_norm 1.6720 (2.4095) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:47:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][330/625] eta 0:02:58 lr 0.000582 wd 0.0500 time 0.5585 (0.6040) data time 0.0007 (0.0027) model time 0.5578 (0.6024) loss 7.9570 (7.3687) grad_norm 2.6663 (2.3993) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:47:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][340/625] eta 0:02:52 lr 0.000582 wd 0.0500 time 0.5623 (0.6037) data time 0.0008 (0.0027) model time 0.5615 (0.6021) loss 8.4004 (7.3815) grad_norm 2.4350 (2.3931) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:47:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][350/625] eta 0:02:45 lr 0.000582 wd 0.0500 time 0.5639 (0.6033) data time 0.0008 (0.0026) model time 0.5631 (0.6017) loss 8.0528 (7.3856) grad_norm 1.9332 (2.3829) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:47:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][360/625] eta 0:02:39 lr 0.000582 wd 0.0500 time 0.5704 (0.6029) data time 0.0006 (0.0026) model time 0.5698 (0.6012) loss 8.4514 (7.3921) grad_norm 2.1017 (2.3768) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:47:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][370/625] eta 0:02:33 lr 0.000582 wd 0.0500 time 0.5695 (0.6022) data time 0.0008 (0.0026) model time 0.5687 (0.6004) loss 7.7062 (7.3929) grad_norm 1.8757 (2.3732) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:47:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][380/625] eta 0:02:27 lr 0.000582 wd 0.0500 time 0.5703 (0.6017) data time 0.0006 (0.0025) model time 0.5697 (0.5998) loss 7.3994 (7.4029) grad_norm 1.9097 (2.3780) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:47:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][390/625] eta 0:02:21 lr 0.000582 wd 0.0500 time 0.5628 (0.6013) data time 0.0006 (0.0025) model time 0.5622 (0.5994) loss 7.0727 (7.3934) grad_norm 2.1366 (2.3701) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:47:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][400/625] eta 0:02:15 lr 0.000582 wd 0.0500 time 0.5705 (0.6009) data time 0.0007 (0.0025) model time 0.5698 (0.5990) loss 7.1831 (7.3979) grad_norm 2.8158 (2.3795) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:47:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][410/625] eta 0:02:09 lr 0.000582 wd 0.0500 time 0.5652 (0.6005) data time 0.0007 (0.0024) model time 0.5646 (0.5985) loss 6.4499 (7.3978) grad_norm 3.2586 (2.3896) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:47:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][420/625] eta 0:02:03 lr 0.000582 wd 0.0500 time 0.5710 (0.6000) data time 0.0008 (0.0024) model time 0.5702 (0.5980) loss 8.7875 (7.3878) grad_norm 1.5859 (2.3851) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:48:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][430/625] eta 0:01:56 lr 0.000581 wd 0.0500 time 0.5747 (0.5996) data time 0.0006 (0.0024) model time 0.5740 (0.5976) loss 8.0449 (7.3935) grad_norm 2.0843 (2.3750) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:48:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][440/625] eta 0:01:50 lr 0.000581 wd 0.0500 time 0.5659 (0.5991) data time 0.0008 (0.0023) model time 0.5652 (0.5970) loss 6.8892 (7.3850) grad_norm 2.5467 (2.3717) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:48:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][450/625] eta 0:01:44 lr 0.000581 wd 0.0500 time 0.5681 (0.5986) data time 0.0008 (0.0023) model time 0.5673 (0.5965) loss 7.0265 (7.3840) grad_norm 1.9276 (2.3654) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:48:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][460/625] eta 0:01:38 lr 0.000581 wd 0.0500 time 0.5675 (0.5982) data time 0.0008 (0.0023) model time 0.5667 (0.5960) loss 7.6427 (7.3812) grad_norm 2.6771 (2.3734) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:48:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][470/625] eta 0:01:32 lr 0.000581 wd 0.0500 time 0.5707 (0.5978) data time 0.0007 (0.0023) model time 0.5700 (0.5956) loss 7.4850 (7.3795) grad_norm 2.4168 (2.3768) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:48:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][480/625] eta 0:01:26 lr 0.000581 wd 0.0500 time 0.5606 (0.5979) data time 0.0006 (0.0022) model time 0.5600 (0.5958) loss 7.0537 (7.3790) grad_norm 2.5930 (2.3890) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:48:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][490/625] eta 0:01:20 lr 0.000581 wd 0.0500 time 0.5633 (0.5981) data time 0.0009 (0.0022) model time 0.5625 (0.5961) loss 7.0688 (7.3862) grad_norm 4.9167 (2.4021) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:48:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][500/625] eta 0:01:14 lr 0.000581 wd 0.0500 time 0.7454 (0.5986) data time 0.0006 (0.0022) model time 0.7448 (0.5966) loss 7.7984 (7.3934) grad_norm 1.8750 (2.4020) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:48:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][510/625] eta 0:01:08 lr 0.000581 wd 0.0500 time 0.7205 (0.5989) data time 0.0008 (0.0021) model time 0.7197 (0.5970) loss 9.0072 (7.3947) grad_norm 2.8261 (2.4063) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:48:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][520/625] eta 0:01:02 lr 0.000580 wd 0.0500 time 0.5683 (0.5988) data time 0.0006 (0.0021) model time 0.5677 (0.5969) loss 5.4766 (7.3948) grad_norm 2.4099 (2.4091) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:49:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][530/625] eta 0:00:56 lr 0.000580 wd 0.0500 time 0.5732 (0.5985) data time 0.0008 (0.0021) model time 0.5724 (0.5966) loss 6.3095 (7.3966) grad_norm 2.1480 (2.4027) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:49:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][540/625] eta 0:00:50 lr 0.000580 wd 0.0500 time 0.5725 (0.5981) data time 0.0006 (0.0021) model time 0.5719 (0.5961) loss 7.1704 (7.3957) grad_norm 2.1451 (2.4058) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:49:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][550/625] eta 0:00:44 lr 0.000580 wd 0.0500 time 0.5642 (0.5980) data time 0.0006 (0.0020) model time 0.5636 (0.5960) loss 6.2182 (7.3989) grad_norm 2.3604 (2.4147) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:49:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][560/625] eta 0:00:38 lr 0.000580 wd 0.0500 time 0.5735 (0.5976) data time 0.0006 (0.0020) model time 0.5728 (0.5956) loss 6.3267 (7.3933) grad_norm 2.4648 (2.4303) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:49:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][570/625] eta 0:00:32 lr 0.000580 wd 0.0500 time 0.5753 (0.5972) data time 0.0006 (0.0020) model time 0.5747 (0.5953) loss 7.8460 (7.4034) grad_norm 2.6234 (2.4387) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:49:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][580/625] eta 0:00:26 lr 0.000580 wd 0.0500 time 0.5738 (0.5968) data time 0.0008 (0.0020) model time 0.5730 (0.5948) loss 8.4060 (7.3999) grad_norm 1.7804 (2.4363) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:49:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][590/625] eta 0:00:20 lr 0.000580 wd 0.0500 time 0.5740 (0.5965) data time 0.0006 (0.0020) model time 0.5734 (0.5945) loss 8.0224 (7.4047) grad_norm 1.6747 (2.4347) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:49:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][600/625] eta 0:00:14 lr 0.000580 wd 0.0500 time 0.5705 (0.5961) data time 0.0008 (0.0019) model time 0.5697 (0.5941) loss 6.9398 (7.3985) grad_norm 1.7517 (2.4279) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:49:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][610/625] eta 0:00:08 lr 0.000580 wd 0.0500 time 0.5727 (0.5958) data time 0.0004 (0.0019) model time 0.5723 (0.5938) loss 5.7667 (7.3987) grad_norm 2.8504 (2.4252) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:49:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [163/300][620/625] eta 0:00:02 lr 0.000579 wd 0.0500 time 0.5708 (0.5955) data time 0.0006 (0.0019) model time 0.5702 (0.5935) loss 7.2456 (7.3967) grad_norm 3.6685 (2.4339) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:49:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 163 training takes 0:06:12 +[2024-07-25 06:49:55 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 06:49:56 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 06:50:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 5.212 (5.212) Loss 0.5205 (0.5205) Acc@1 89.209 (89.209) Acc@5 98.682 (98.682) Mem 22339MB +[2024-07-25 06:50:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.588) Loss 0.8237 (0.6505) Acc@1 80.615 (86.146) Acc@5 95.947 (97.665) Mem 22339MB +[2024-07-25 06:50:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.368) Loss 0.9248 (0.7541) Acc@1 78.174 (83.198) Acc@5 94.971 (96.601) Mem 22339MB +[2024-07-25 06:50:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.867 Acc@5 96.599 +[2024-07-25 06:50:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.9% +[2024-07-25 06:50:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.803 (0.803) Loss 0.4939 (0.4939) Acc@1 89.844 (89.844) Acc@5 98.828 (98.828) Mem 22339MB +[2024-07-25 06:50:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.188) Loss 0.7642 (0.6226) Acc@1 82.080 (86.847) Acc@5 96.484 (97.860) Mem 22339MB +[2024-07-25 06:50:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.158) Loss 0.8921 (0.7231) Acc@1 78.320 (83.759) Acc@5 95.801 (96.863) Mem 22339MB +[2024-07-25 06:50:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.345 Acc@5 96.871 +[2024-07-25 06:50:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.3% +[2024-07-25 06:50:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.35% +[2024-07-25 06:50:08 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 06:50:09 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 06:50:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][0/625] eta 0:08:46 lr 0.000579 wd 0.0500 time 0.8423 (0.8423) data time 0.3271 (0.3271) model time 0.0000 (0.0000) loss 6.9519 (6.9519) grad_norm 1.8539 (1.8539) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:50:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][10/625] eta 0:06:09 lr 0.000579 wd 0.0500 time 0.5692 (0.6007) data time 0.0006 (0.0307) model time 0.0000 (0.0000) loss 6.1204 (7.3667) grad_norm 2.5341 (2.7409) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:50:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][20/625] eta 0:05:56 lr 0.000579 wd 0.0500 time 0.5716 (0.5899) data time 0.0006 (0.0164) model time 0.0000 (0.0000) loss 7.1328 (7.3290) grad_norm 2.1754 (2.4979) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:50:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][30/625] eta 0:05:48 lr 0.000579 wd 0.0500 time 0.5727 (0.5852) data time 0.0006 (0.0114) model time 0.0000 (0.0000) loss 8.8126 (7.5221) grad_norm 1.9294 (2.3820) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:50:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][40/625] eta 0:05:40 lr 0.000579 wd 0.0500 time 0.5733 (0.5822) data time 0.0007 (0.0088) model time 0.0000 (0.0000) loss 8.4412 (7.5552) grad_norm 2.5656 (2.3497) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:50:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][50/625] eta 0:05:33 lr 0.000579 wd 0.0500 time 0.5734 (0.5805) data time 0.0008 (0.0072) model time 0.0000 (0.0000) loss 6.9416 (7.5129) grad_norm 2.6129 (2.3511) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:50:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][60/625] eta 0:05:27 lr 0.000579 wd 0.0500 time 0.5710 (0.5794) data time 0.0007 (0.0062) model time 0.5702 (0.5733) loss 9.1386 (7.4914) grad_norm 1.6866 (2.2957) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:50:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][70/625] eta 0:05:21 lr 0.000579 wd 0.0500 time 0.5725 (0.5788) data time 0.0006 (0.0054) model time 0.5719 (0.5738) loss 7.2771 (7.5516) grad_norm 3.1889 (2.3400) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:50:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][80/625] eta 0:05:16 lr 0.000578 wd 0.0500 time 0.6346 (0.5814) data time 0.0006 (0.0048) model time 0.6340 (0.5821) loss 7.1430 (7.5613) grad_norm 1.8894 (2.3916) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:51:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][90/625] eta 0:05:13 lr 0.000578 wd 0.0500 time 0.5727 (0.5852) data time 0.0008 (0.0044) model time 0.5719 (0.5905) loss 7.3516 (7.5853) grad_norm 2.3930 (2.4295) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:51:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][100/625] eta 0:05:11 lr 0.000578 wd 0.0500 time 0.7365 (0.5926) data time 0.0009 (0.0041) model time 0.7355 (0.6041) loss 8.9310 (7.5692) grad_norm 1.7591 (2.3903) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:51:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][110/625] eta 0:05:07 lr 0.000578 wd 0.0500 time 0.5611 (0.5966) data time 0.0006 (0.0038) model time 0.5604 (0.6094) loss 8.7922 (7.5665) grad_norm 1.7793 (2.3814) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:51:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][120/625] eta 0:05:00 lr 0.000578 wd 0.0500 time 0.5673 (0.5959) data time 0.0006 (0.0035) model time 0.5667 (0.6063) loss 7.7932 (7.5886) grad_norm 1.7294 (2.3597) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:51:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][130/625] eta 0:04:54 lr 0.000578 wd 0.0500 time 0.5677 (0.5954) data time 0.0006 (0.0034) model time 0.5670 (0.6040) loss 7.3933 (7.5395) grad_norm 2.0170 (2.3510) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 06:51:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][140/625] eta 0:04:48 lr 0.000578 wd 0.0500 time 0.5631 (0.5939) data time 0.0007 (0.0032) model time 0.5625 (0.6006) loss 7.4642 (7.5191) grad_norm 2.5653 (inf) loss_scale 1024.0000 (2011.6879) mem 22339MB +[2024-07-25 06:51:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][150/625] eta 0:04:41 lr 0.000578 wd 0.0500 time 0.5607 (0.5929) data time 0.0008 (0.0030) model time 0.5599 (0.5984) loss 6.5797 (7.5222) grad_norm 1.8216 (inf) loss_scale 1024.0000 (1946.2781) mem 22339MB +[2024-07-25 06:51:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][160/625] eta 0:04:35 lr 0.000578 wd 0.0500 time 0.5633 (0.5920) data time 0.0006 (0.0030) model time 0.5627 (0.5964) loss 6.9387 (7.5578) grad_norm 1.8349 (inf) loss_scale 1024.0000 (1888.9938) mem 22339MB +[2024-07-25 06:51:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][170/625] eta 0:04:28 lr 0.000578 wd 0.0500 time 0.5731 (0.5910) data time 0.0008 (0.0029) model time 0.5724 (0.5945) loss 6.8160 (7.5439) grad_norm 1.6055 (inf) loss_scale 1024.0000 (1838.4094) mem 22339MB +[2024-07-25 06:51:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][180/625] eta 0:04:22 lr 0.000577 wd 0.0500 time 0.5632 (0.5904) data time 0.0006 (0.0028) model time 0.5626 (0.5933) loss 6.9816 (7.5352) grad_norm 1.7430 (inf) loss_scale 1024.0000 (1793.4144) mem 22339MB +[2024-07-25 06:52:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][190/625] eta 0:04:16 lr 0.000577 wd 0.0500 time 0.5612 (0.5896) data time 0.0010 (0.0027) model time 0.5602 (0.5919) loss 8.2269 (7.5144) grad_norm 2.1384 (inf) loss_scale 1024.0000 (1753.1309) mem 22339MB +[2024-07-25 06:52:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][200/625] eta 0:04:10 lr 0.000577 wd 0.0500 time 0.5618 (0.5890) data time 0.0008 (0.0026) model time 0.5610 (0.5909) loss 8.9531 (7.5355) grad_norm 3.5969 (inf) loss_scale 1024.0000 (1716.8557) mem 22339MB +[2024-07-25 06:52:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][210/625] eta 0:04:04 lr 0.000577 wd 0.0500 time 0.5637 (0.5884) data time 0.0008 (0.0025) model time 0.5630 (0.5900) loss 8.2201 (7.5416) grad_norm 1.9073 (inf) loss_scale 1024.0000 (1684.0190) mem 22339MB +[2024-07-25 06:52:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][220/625] eta 0:03:58 lr 0.000577 wd 0.0500 time 0.5697 (0.5879) data time 0.0006 (0.0025) model time 0.5691 (0.5891) loss 8.5310 (7.5437) grad_norm 2.0323 (inf) loss_scale 1024.0000 (1654.1538) mem 22339MB +[2024-07-25 06:52:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][230/625] eta 0:03:52 lr 0.000577 wd 0.0500 time 0.5732 (0.5875) data time 0.0006 (0.0024) model time 0.5725 (0.5885) loss 6.3302 (7.5122) grad_norm 2.5549 (inf) loss_scale 1024.0000 (1626.8745) mem 22339MB +[2024-07-25 06:52:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][240/625] eta 0:03:46 lr 0.000577 wd 0.0500 time 0.5665 (0.5870) data time 0.0008 (0.0023) model time 0.5657 (0.5878) loss 7.6429 (7.5125) grad_norm 2.1429 (inf) loss_scale 1024.0000 (1601.8589) mem 22339MB +[2024-07-25 06:52:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][250/625] eta 0:03:39 lr 0.000577 wd 0.0500 time 0.5723 (0.5866) data time 0.0007 (0.0023) model time 0.5716 (0.5872) loss 10.6327 (7.5152) grad_norm 2.3359 (inf) loss_scale 1024.0000 (1578.8367) mem 22339MB +[2024-07-25 06:52:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][260/625] eta 0:03:33 lr 0.000577 wd 0.0500 time 0.5693 (0.5862) data time 0.0006 (0.0022) model time 0.5687 (0.5866) loss 7.8346 (7.4940) grad_norm 2.3325 (inf) loss_scale 1024.0000 (1557.5785) mem 22339MB +[2024-07-25 06:52:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][270/625] eta 0:03:27 lr 0.000576 wd 0.0500 time 0.5618 (0.5858) data time 0.0008 (0.0022) model time 0.5610 (0.5861) loss 7.6491 (7.4887) grad_norm 3.5978 (inf) loss_scale 1024.0000 (1537.8893) mem 22339MB +[2024-07-25 06:52:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][280/625] eta 0:03:21 lr 0.000576 wd 0.0500 time 0.5690 (0.5855) data time 0.0008 (0.0021) model time 0.5682 (0.5856) loss 7.0885 (7.4898) grad_norm 1.6105 (inf) loss_scale 1024.0000 (1519.6014) mem 22339MB +[2024-07-25 06:53:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][290/625] eta 0:03:16 lr 0.000576 wd 0.0500 time 0.5732 (0.5853) data time 0.0007 (0.0021) model time 0.5725 (0.5853) loss 6.3414 (7.4920) grad_norm 1.9263 (inf) loss_scale 1024.0000 (1502.5704) mem 22339MB +[2024-07-25 06:53:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][300/625] eta 0:03:10 lr 0.000576 wd 0.0500 time 0.7020 (0.5863) data time 0.0006 (0.0020) model time 0.7014 (0.5865) loss 8.8386 (7.4861) grad_norm 2.9580 (inf) loss_scale 1024.0000 (1486.6711) mem 22339MB +[2024-07-25 06:53:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][310/625] eta 0:03:04 lr 0.000576 wd 0.0500 time 0.5789 (0.5871) data time 0.0008 (0.0020) model time 0.5781 (0.5874) loss 8.2315 (7.4793) grad_norm 2.0119 (inf) loss_scale 1024.0000 (1471.7942) mem 22339MB +[2024-07-25 06:53:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][320/625] eta 0:02:59 lr 0.000576 wd 0.0500 time 0.5706 (0.5885) data time 0.0006 (0.0020) model time 0.5700 (0.5891) loss 7.3369 (7.4866) grad_norm 2.4301 (inf) loss_scale 1024.0000 (1457.8442) mem 22339MB +[2024-07-25 06:53:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][330/625] eta 0:02:53 lr 0.000576 wd 0.0500 time 0.5724 (0.5893) data time 0.0008 (0.0019) model time 0.5715 (0.5899) loss 5.6623 (7.4938) grad_norm 1.8773 (inf) loss_scale 1024.0000 (1444.7372) mem 22339MB +[2024-07-25 06:53:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][340/625] eta 0:02:47 lr 0.000576 wd 0.0500 time 0.5689 (0.5892) data time 0.0008 (0.0019) model time 0.5682 (0.5898) loss 7.6061 (7.4917) grad_norm 2.9236 (inf) loss_scale 1024.0000 (1432.3988) mem 22339MB +[2024-07-25 06:53:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][350/625] eta 0:02:41 lr 0.000576 wd 0.0500 time 0.5629 (0.5889) data time 0.0009 (0.0019) model time 0.5620 (0.5894) loss 6.9450 (7.4838) grad_norm 1.7959 (inf) loss_scale 1024.0000 (1420.7635) mem 22339MB +[2024-07-25 06:53:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][360/625] eta 0:02:35 lr 0.000576 wd 0.0500 time 0.5634 (0.5885) data time 0.0006 (0.0018) model time 0.5628 (0.5889) loss 5.7486 (7.4765) grad_norm 1.7335 (inf) loss_scale 1024.0000 (1409.7729) mem 22339MB +[2024-07-25 06:53:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][370/625] eta 0:02:30 lr 0.000575 wd 0.0500 time 0.5746 (0.5886) data time 0.0008 (0.0019) model time 0.5739 (0.5888) loss 7.2487 (7.4667) grad_norm 1.7553 (inf) loss_scale 1024.0000 (1399.3747) mem 22339MB +[2024-07-25 06:53:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][380/625] eta 0:02:24 lr 0.000575 wd 0.0500 time 0.5749 (0.5882) data time 0.0008 (0.0019) model time 0.5741 (0.5884) loss 6.6205 (7.4586) grad_norm 2.3616 (inf) loss_scale 1024.0000 (1389.5223) mem 22339MB +[2024-07-25 06:53:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][390/625] eta 0:02:18 lr 0.000575 wd 0.0500 time 0.5738 (0.5879) data time 0.0009 (0.0018) model time 0.5729 (0.5880) loss 6.6260 (7.4649) grad_norm 2.1028 (inf) loss_scale 1024.0000 (1380.1739) mem 22339MB +[2024-07-25 06:54:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][400/625] eta 0:02:12 lr 0.000575 wd 0.0500 time 0.5706 (0.5876) data time 0.0008 (0.0018) model time 0.5698 (0.5876) loss 7.5199 (7.4739) grad_norm 2.9116 (inf) loss_scale 1024.0000 (1371.2918) mem 22339MB +[2024-07-25 06:54:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][410/625] eta 0:02:06 lr 0.000575 wd 0.0500 time 0.5715 (0.5873) data time 0.0008 (0.0018) model time 0.5707 (0.5872) loss 7.9485 (7.4834) grad_norm 1.7473 (inf) loss_scale 1024.0000 (1362.8418) mem 22339MB +[2024-07-25 06:54:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][420/625] eta 0:02:00 lr 0.000575 wd 0.0500 time 0.5742 (0.5870) data time 0.0006 (0.0018) model time 0.5736 (0.5869) loss 6.2662 (7.4820) grad_norm 1.9342 (inf) loss_scale 1024.0000 (1354.7933) mem 22339MB +[2024-07-25 06:54:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][430/625] eta 0:01:54 lr 0.000575 wd 0.0500 time 0.5686 (0.5867) data time 0.0006 (0.0018) model time 0.5679 (0.5865) loss 6.7460 (7.4879) grad_norm 2.1203 (inf) loss_scale 1024.0000 (1347.1183) mem 22339MB +[2024-07-25 06:54:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][440/625] eta 0:01:48 lr 0.000575 wd 0.0500 time 0.5722 (0.5864) data time 0.0006 (0.0017) model time 0.5715 (0.5862) loss 5.8864 (7.4805) grad_norm 2.9922 (inf) loss_scale 1024.0000 (1339.7914) mem 22339MB +[2024-07-25 06:54:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][450/625] eta 0:01:42 lr 0.000575 wd 0.0500 time 0.5758 (0.5862) data time 0.0006 (0.0017) model time 0.5752 (0.5859) loss 6.5424 (7.4814) grad_norm 5.6856 (inf) loss_scale 1024.0000 (1332.7894) mem 22339MB +[2024-07-25 06:54:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][460/625] eta 0:01:36 lr 0.000574 wd 0.0500 time 0.5714 (0.5859) data time 0.0008 (0.0017) model time 0.5706 (0.5856) loss 7.3321 (7.4726) grad_norm 3.4039 (inf) loss_scale 1024.0000 (1326.0911) mem 22339MB +[2024-07-25 06:54:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][470/625] eta 0:01:30 lr 0.000574 wd 0.0500 time 0.5714 (0.5857) data time 0.0008 (0.0017) model time 0.5706 (0.5853) loss 8.9313 (7.4778) grad_norm 2.7278 (inf) loss_scale 1024.0000 (1319.6773) mem 22339MB +[2024-07-25 06:54:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][480/625] eta 0:01:24 lr 0.000574 wd 0.0500 time 0.5727 (0.5854) data time 0.0006 (0.0017) model time 0.5721 (0.5850) loss 7.3983 (7.4755) grad_norm 2.3910 (inf) loss_scale 1024.0000 (1313.5301) mem 22339MB +[2024-07-25 06:54:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][490/625] eta 0:01:19 lr 0.000574 wd 0.0500 time 0.5716 (0.5852) data time 0.0006 (0.0016) model time 0.5710 (0.5848) loss 5.8850 (7.4663) grad_norm 2.0926 (inf) loss_scale 1024.0000 (1307.6334) mem 22339MB +[2024-07-25 06:55:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][500/625] eta 0:01:13 lr 0.000574 wd 0.0500 time 0.5752 (0.5850) data time 0.0006 (0.0016) model time 0.5745 (0.5845) loss 5.8174 (7.4550) grad_norm 3.7291 (inf) loss_scale 1024.0000 (1301.9721) mem 22339MB +[2024-07-25 06:55:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][510/625] eta 0:01:07 lr 0.000574 wd 0.0500 time 0.7307 (0.5850) data time 0.0008 (0.0016) model time 0.7299 (0.5846) loss 8.2252 (7.4610) grad_norm 1.5383 (inf) loss_scale 1024.0000 (1296.5323) mem 22339MB +[2024-07-25 06:55:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][520/625] eta 0:01:01 lr 0.000574 wd 0.0500 time 0.5605 (0.5849) data time 0.0008 (0.0016) model time 0.5597 (0.5844) loss 8.3673 (7.4610) grad_norm 4.4268 (inf) loss_scale 1024.0000 (1291.3013) mem 22339MB +[2024-07-25 06:55:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][530/625] eta 0:00:55 lr 0.000574 wd 0.0500 time 0.5709 (0.5853) data time 0.0008 (0.0016) model time 0.5701 (0.5848) loss 7.1182 (7.4671) grad_norm 2.1078 (inf) loss_scale 1024.0000 (1286.2674) mem 22339MB +[2024-07-25 06:55:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][540/625] eta 0:00:49 lr 0.000574 wd 0.0500 time 0.7291 (0.5867) data time 0.0006 (0.0016) model time 0.7284 (0.5864) loss 7.3378 (7.4679) grad_norm 1.6556 (inf) loss_scale 1024.0000 (1281.4196) mem 22339MB +[2024-07-25 06:55:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][550/625] eta 0:00:44 lr 0.000573 wd 0.0500 time 0.7034 (0.5873) data time 0.0008 (0.0015) model time 0.7025 (0.5870) loss 6.4429 (7.4691) grad_norm 2.1014 (inf) loss_scale 1024.0000 (1276.7477) mem 22339MB +[2024-07-25 06:55:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][560/625] eta 0:00:38 lr 0.000573 wd 0.0500 time 0.5711 (0.5871) data time 0.0007 (0.0015) model time 0.5705 (0.5868) loss 8.2822 (7.4607) grad_norm 2.5347 (inf) loss_scale 1024.0000 (1272.2424) mem 22339MB +[2024-07-25 06:55:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][570/625] eta 0:00:32 lr 0.000573 wd 0.0500 time 0.5745 (0.5870) data time 0.0006 (0.0015) model time 0.5739 (0.5866) loss 7.0295 (7.4623) grad_norm 2.2593 (inf) loss_scale 1024.0000 (1267.8949) mem 22339MB +[2024-07-25 06:55:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][580/625] eta 0:00:26 lr 0.000573 wd 0.0500 time 0.5731 (0.5868) data time 0.0009 (0.0015) model time 0.5723 (0.5864) loss 7.6441 (7.4670) grad_norm 2.7229 (inf) loss_scale 1024.0000 (1263.6971) mem 22339MB +[2024-07-25 06:55:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][590/625] eta 0:00:20 lr 0.000573 wd 0.0500 time 0.5730 (0.5865) data time 0.0006 (0.0015) model time 0.5724 (0.5862) loss 7.8451 (7.4667) grad_norm 2.7704 (inf) loss_scale 1024.0000 (1259.6413) mem 22339MB +[2024-07-25 06:56:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][600/625] eta 0:00:14 lr 0.000573 wd 0.0500 time 0.5734 (0.5864) data time 0.0006 (0.0015) model time 0.5727 (0.5859) loss 5.9797 (7.4669) grad_norm 1.8704 (inf) loss_scale 1024.0000 (1255.7205) mem 22339MB +[2024-07-25 06:56:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][610/625] eta 0:00:08 lr 0.000573 wd 0.0500 time 0.5721 (0.5863) data time 0.0006 (0.0015) model time 0.5715 (0.5858) loss 7.4859 (7.4742) grad_norm 2.1377 (inf) loss_scale 1024.0000 (1251.9280) mem 22339MB +[2024-07-25 06:56:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [164/300][620/625] eta 0:00:02 lr 0.000573 wd 0.0500 time 0.5721 (0.5861) data time 0.0006 (0.0015) model time 0.5715 (0.5856) loss 7.1941 (7.4682) grad_norm 2.0777 (inf) loss_scale 1024.0000 (1248.2576) mem 22339MB +[2024-07-25 06:56:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 164 training takes 0:06:06 +[2024-07-25 06:56:16 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 06:56:17 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 06:56:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.479 (0.479) Loss 0.5142 (0.5142) Acc@1 89.111 (89.111) Acc@5 98.584 (98.584) Mem 22339MB +[2024-07-25 06:56:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.8096 (0.6386) Acc@1 81.641 (86.350) Acc@5 96.094 (97.745) Mem 22339MB +[2024-07-25 06:56:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8950 (0.7437) Acc@1 77.881 (83.303) Acc@5 95.947 (96.738) Mem 22339MB +[2024-07-25 06:56:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.945 Acc@5 96.697 +[2024-07-25 06:56:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.9% +[2024-07-25 06:56:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 82.95% +[2024-07-25 06:56:21 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 06:56:22 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 06:56:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.479 (0.479) Loss 0.4944 (0.4944) Acc@1 89.893 (89.893) Acc@5 98.828 (98.828) Mem 22339MB +[2024-07-25 06:56:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.158) Loss 0.7637 (0.6228) Acc@1 82.080 (86.905) Acc@5 96.484 (97.852) Mem 22339MB +[2024-07-25 06:56:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8916 (0.7228) Acc@1 78.467 (83.815) Acc@5 95.752 (96.856) Mem 22339MB +[2024-07-25 06:56:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.401 Acc@5 96.867 +[2024-07-25 06:56:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.4% +[2024-07-25 06:56:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.40% +[2024-07-25 06:56:26 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 06:56:27 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 06:56:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][0/625] eta 0:09:04 lr 0.000573 wd 0.0500 time 0.8720 (0.8720) data time 0.3560 (0.3560) model time 0.0000 (0.0000) loss 7.9316 (7.9316) grad_norm 2.9915 (2.9915) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:56:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][10/625] eta 0:06:09 lr 0.000573 wd 0.0500 time 0.5663 (0.6009) data time 0.0008 (0.0331) model time 0.0000 (0.0000) loss 6.9399 (7.3994) grad_norm 2.1351 (2.6693) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:56:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][20/625] eta 0:06:00 lr 0.000572 wd 0.0500 time 0.5722 (0.5958) data time 0.0007 (0.0177) model time 0.0000 (0.0000) loss 6.8795 (7.4396) grad_norm 1.9650 (2.4801) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:56:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][30/625] eta 0:05:50 lr 0.000572 wd 0.0500 time 0.5729 (0.5889) data time 0.0008 (0.0122) model time 0.0000 (0.0000) loss 8.3754 (7.4750) grad_norm 2.0642 (2.3999) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:56:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][40/625] eta 0:05:42 lr 0.000572 wd 0.0500 time 0.5731 (0.5855) data time 0.0008 (0.0094) model time 0.0000 (0.0000) loss 6.5965 (7.3528) grad_norm 2.3914 (2.3350) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:56:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][50/625] eta 0:05:35 lr 0.000572 wd 0.0500 time 0.5693 (0.5832) data time 0.0008 (0.0077) model time 0.0000 (0.0000) loss 5.5760 (7.3828) grad_norm 1.8185 (2.2694) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:57:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][60/625] eta 0:05:28 lr 0.000572 wd 0.0500 time 0.5678 (0.5818) data time 0.0008 (0.0066) model time 0.5670 (0.5736) loss 8.4336 (7.4420) grad_norm 1.8355 (2.2264) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:57:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][70/625] eta 0:05:22 lr 0.000572 wd 0.0500 time 0.5705 (0.5809) data time 0.0009 (0.0058) model time 0.5696 (0.5742) loss 7.5727 (7.4655) grad_norm 2.7552 (2.2371) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:57:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][80/625] eta 0:05:16 lr 0.000572 wd 0.0500 time 0.5727 (0.5800) data time 0.0008 (0.0052) model time 0.5718 (0.5738) loss 7.0941 (7.4691) grad_norm 2.6141 (2.3044) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:57:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][90/625] eta 0:05:10 lr 0.000572 wd 0.0500 time 0.5686 (0.5795) data time 0.0006 (0.0047) model time 0.5680 (0.5739) loss 7.0026 (7.5261) grad_norm 1.8305 (2.3478) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:57:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][100/625] eta 0:05:03 lr 0.000572 wd 0.0500 time 0.5734 (0.5790) data time 0.0008 (0.0043) model time 0.5725 (0.5739) loss 6.4556 (7.4982) grad_norm 1.9359 (2.3628) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:57:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][110/625] eta 0:04:58 lr 0.000572 wd 0.0500 time 0.5741 (0.5799) data time 0.0008 (0.0040) model time 0.5733 (0.5763) loss 7.9309 (7.4757) grad_norm 2.2828 (2.3527) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:57:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][120/625] eta 0:04:54 lr 0.000571 wd 0.0500 time 0.7259 (0.5823) data time 0.0007 (0.0037) model time 0.7252 (0.5807) loss 7.9060 (7.4511) grad_norm 1.8162 (2.3186) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:57:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][130/625] eta 0:04:50 lr 0.000571 wd 0.0500 time 0.7466 (0.5862) data time 0.0008 (0.0035) model time 0.7458 (0.5874) loss 7.0248 (7.4880) grad_norm 1.6207 (2.2850) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:57:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][140/625] eta 0:04:45 lr 0.000571 wd 0.0500 time 0.5722 (0.5892) data time 0.0008 (0.0033) model time 0.5713 (0.5919) loss 8.9722 (7.5090) grad_norm 1.7664 (2.2825) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:57:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][150/625] eta 0:04:41 lr 0.000571 wd 0.0500 time 0.6065 (0.5919) data time 0.0007 (0.0031) model time 0.6058 (0.5955) loss 9.3932 (7.5254) grad_norm 1.5490 (2.2742) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:58:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][160/625] eta 0:04:35 lr 0.000571 wd 0.0500 time 0.6925 (0.5916) data time 0.0009 (0.0030) model time 0.6917 (0.5946) loss 8.7630 (7.5242) grad_norm 1.6638 (2.2568) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:58:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][170/625] eta 0:04:28 lr 0.000571 wd 0.0500 time 0.5707 (0.5905) data time 0.0008 (0.0029) model time 0.5700 (0.5929) loss 7.3229 (7.5245) grad_norm 2.7594 (2.2547) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:58:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][180/625] eta 0:04:22 lr 0.000571 wd 0.0500 time 0.5738 (0.5897) data time 0.0008 (0.0028) model time 0.5730 (0.5914) loss 7.3497 (7.5130) grad_norm 3.7810 (2.2710) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:58:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][190/625] eta 0:04:16 lr 0.000571 wd 0.0500 time 0.5713 (0.5889) data time 0.0006 (0.0026) model time 0.5707 (0.5902) loss 6.5671 (7.4972) grad_norm 1.8002 (2.2980) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:58:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][200/625] eta 0:04:09 lr 0.000571 wd 0.0500 time 0.5718 (0.5881) data time 0.0007 (0.0026) model time 0.5711 (0.5890) loss 7.4770 (7.4875) grad_norm 3.0195 (2.3099) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:58:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][210/625] eta 0:04:03 lr 0.000570 wd 0.0500 time 0.5708 (0.5875) data time 0.0006 (0.0025) model time 0.5702 (0.5880) loss 7.5934 (7.4841) grad_norm 1.9266 (2.2974) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:58:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][220/625] eta 0:03:57 lr 0.000570 wd 0.0500 time 0.5732 (0.5870) data time 0.0006 (0.0024) model time 0.5726 (0.5873) loss 6.2331 (7.4697) grad_norm 3.5138 (2.3111) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:58:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][230/625] eta 0:03:51 lr 0.000570 wd 0.0500 time 0.5728 (0.5865) data time 0.0006 (0.0023) model time 0.5722 (0.5866) loss 9.0146 (7.4899) grad_norm 2.2886 (2.3283) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:58:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][240/625] eta 0:03:45 lr 0.000570 wd 0.0500 time 0.5745 (0.5862) data time 0.0006 (0.0023) model time 0.5739 (0.5862) loss 7.6776 (7.4876) grad_norm 1.8961 (2.3281) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:58:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][250/625] eta 0:03:39 lr 0.000570 wd 0.0500 time 0.5763 (0.5858) data time 0.0008 (0.0022) model time 0.5754 (0.5857) loss 7.1146 (7.4593) grad_norm 2.8598 (2.3169) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:59:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][260/625] eta 0:03:33 lr 0.000570 wd 0.0500 time 0.5754 (0.5854) data time 0.0006 (0.0021) model time 0.5748 (0.5851) loss 8.6723 (7.4601) grad_norm 1.9180 (2.3016) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:59:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][270/625] eta 0:03:27 lr 0.000570 wd 0.0500 time 0.5764 (0.5850) data time 0.0008 (0.0021) model time 0.5756 (0.5846) loss 8.2855 (7.4652) grad_norm 1.7146 (2.2959) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:59:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][280/625] eta 0:03:21 lr 0.000570 wd 0.0500 time 0.5718 (0.5846) data time 0.0008 (0.0021) model time 0.5710 (0.5841) loss 8.7085 (7.4732) grad_norm 1.6321 (2.2973) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:59:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][290/625] eta 0:03:15 lr 0.000570 wd 0.0500 time 0.5787 (0.5843) data time 0.0006 (0.0020) model time 0.5781 (0.5837) loss 5.8744 (7.4707) grad_norm 2.1203 (2.2991) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:59:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][300/625] eta 0:03:09 lr 0.000570 wd 0.0500 time 0.5735 (0.5840) data time 0.0008 (0.0020) model time 0.5727 (0.5834) loss 7.1398 (7.4697) grad_norm 4.7212 (2.3072) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:59:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][310/625] eta 0:03:03 lr 0.000569 wd 0.0500 time 0.5726 (0.5837) data time 0.0006 (0.0019) model time 0.5720 (0.5830) loss 7.2398 (7.4776) grad_norm 2.2232 (2.3227) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:59:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][320/625] eta 0:02:57 lr 0.000569 wd 0.0500 time 0.5752 (0.5835) data time 0.0006 (0.0019) model time 0.5746 (0.5827) loss 5.6936 (7.4486) grad_norm 2.7446 (2.3389) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:59:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][330/625] eta 0:02:52 lr 0.000569 wd 0.0500 time 0.5741 (0.5836) data time 0.0007 (0.0019) model time 0.5734 (0.5829) loss 6.0602 (7.4404) grad_norm 3.1256 (2.3454) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:59:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][340/625] eta 0:02:46 lr 0.000569 wd 0.0500 time 0.5747 (0.5841) data time 0.0008 (0.0018) model time 0.5739 (0.5835) loss 6.5084 (7.4437) grad_norm 2.1141 (2.3417) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:59:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][350/625] eta 0:02:40 lr 0.000569 wd 0.0500 time 0.7332 (0.5850) data time 0.0008 (0.0018) model time 0.7324 (0.5845) loss 7.2667 (7.4455) grad_norm 2.6716 (2.3450) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 06:59:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][360/625] eta 0:02:35 lr 0.000569 wd 0.0500 time 0.6007 (0.5867) data time 0.0006 (0.0018) model time 0.6001 (0.5865) loss 8.3188 (7.4519) grad_norm 1.6765 (2.3519) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:00:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][370/625] eta 0:02:29 lr 0.000569 wd 0.0500 time 0.5673 (0.5877) data time 0.0006 (0.0018) model time 0.5666 (0.5876) loss 5.7546 (7.4495) grad_norm 1.9079 (2.3412) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:00:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][380/625] eta 0:02:23 lr 0.000569 wd 0.0500 time 0.5727 (0.5876) data time 0.0007 (0.0017) model time 0.5720 (0.5875) loss 7.9225 (7.4529) grad_norm 4.6408 (2.3514) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:00:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][390/625] eta 0:02:18 lr 0.000569 wd 0.0500 time 0.5745 (0.5873) data time 0.0007 (0.0017) model time 0.5739 (0.5871) loss 6.3636 (7.4568) grad_norm 2.5323 (2.3699) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:00:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][400/625] eta 0:02:12 lr 0.000568 wd 0.0500 time 0.5737 (0.5870) data time 0.0006 (0.0017) model time 0.5731 (0.5868) loss 8.3765 (7.4680) grad_norm 1.6709 (2.3609) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:00:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][410/625] eta 0:02:06 lr 0.000568 wd 0.0500 time 0.5581 (0.5866) data time 0.0009 (0.0017) model time 0.5572 (0.5863) loss 8.5141 (7.4665) grad_norm 2.4664 (2.3558) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:00:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][420/625] eta 0:02:00 lr 0.000568 wd 0.0500 time 0.5676 (0.5865) data time 0.0008 (0.0016) model time 0.5668 (0.5862) loss 6.1196 (7.4587) grad_norm 16.3462 (2.3833) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:00:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][430/625] eta 0:01:54 lr 0.000568 wd 0.0500 time 0.5761 (0.5863) data time 0.0006 (0.0016) model time 0.5755 (0.5859) loss 7.3360 (7.4537) grad_norm 1.9681 (2.3829) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:00:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][440/625] eta 0:01:48 lr 0.000568 wd 0.0500 time 0.5682 (0.5860) data time 0.0008 (0.0016) model time 0.5675 (0.5856) loss 7.5403 (7.4623) grad_norm 2.3592 (2.3780) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:00:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][450/625] eta 0:01:42 lr 0.000568 wd 0.0500 time 0.5743 (0.5858) data time 0.0006 (0.0016) model time 0.5737 (0.5853) loss 6.7554 (7.4556) grad_norm 2.2358 (2.3750) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:01:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][460/625] eta 0:01:40 lr 0.000568 wd 0.0500 time 0.5703 (0.6071) data time 0.0006 (0.0023) model time 0.5698 (0.6084) loss 6.3380 (7.4496) grad_norm 2.3463 (2.3928) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:01:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][470/625] eta 0:01:35 lr 0.000568 wd 0.0500 time 0.5677 (0.6172) data time 0.0008 (0.0023) model time 0.5669 (0.6196) loss 7.7079 (7.4510) grad_norm 2.0367 (2.3911) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:01:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][480/625] eta 0:01:31 lr 0.000568 wd 0.0500 time 1.0154 (0.6296) data time 0.0008 (0.0025) model time 1.0146 (0.6332) loss 7.7872 (7.4572) grad_norm 2.7758 (2.3858) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:01:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][490/625] eta 0:01:26 lr 0.000567 wd 0.0500 time 0.9406 (0.6377) data time 0.0008 (0.0025) model time 0.9398 (0.6422) loss 7.2906 (7.4611) grad_norm 1.8594 (2.3802) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:01:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][500/625] eta 0:01:21 lr 0.000567 wd 0.0500 time 0.5669 (0.6513) data time 0.0005 (0.0025) model time 0.5664 (0.6572) loss 8.0507 (7.4574) grad_norm 1.8672 (2.3803) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:01:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][510/625] eta 0:01:14 lr 0.000567 wd 0.0500 time 0.5677 (0.6498) data time 0.0008 (0.0024) model time 0.5669 (0.6553) loss 7.7629 (7.4524) grad_norm 2.4933 (2.3807) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:02:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][520/625] eta 0:01:08 lr 0.000567 wd 0.0500 time 0.5747 (0.6483) data time 0.0006 (0.0024) model time 0.5741 (0.6535) loss 7.0030 (7.4467) grad_norm 2.2977 (2.3907) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:02:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][530/625] eta 0:01:01 lr 0.000567 wd 0.0500 time 2.6506 (0.6507) data time 0.0007 (0.0024) model time 2.6498 (0.6561) loss 9.3694 (7.4521) grad_norm 4.4461 (2.4050) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:02:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][540/625] eta 0:00:56 lr 0.000567 wd 0.0500 time 0.6380 (0.6630) data time 0.0008 (0.0023) model time 0.6372 (0.6695) loss 7.4572 (7.4519) grad_norm 2.1892 (2.4104) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:02:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][550/625] eta 0:00:51 lr 0.000567 wd 0.0500 time 1.4305 (0.6820) data time 0.0009 (0.0023) model time 1.4296 (0.6903) loss 7.8214 (7.4563) grad_norm 2.7212 (2.4090) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:02:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][560/625] eta 0:00:44 lr 0.000567 wd 0.0500 time 1.2073 (0.6882) data time 0.0008 (0.0023) model time 1.2065 (0.6970) loss 7.2556 (7.4575) grad_norm 2.7631 (2.4133) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:03:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][570/625] eta 0:00:38 lr 0.000567 wd 0.0500 time 1.2429 (0.6958) data time 0.5059 (0.0031) model time 0.7370 (0.7041) loss 7.0595 (7.4571) grad_norm 4.0049 (2.4163) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:03:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][580/625] eta 0:00:31 lr 0.000567 wd 0.0500 time 2.1675 (0.7049) data time 0.0008 (0.0045) model time 2.1667 (0.7124) loss 7.5782 (7.4472) grad_norm 3.8347 (2.4247) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:03:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][590/625] eta 0:00:24 lr 0.000566 wd 0.0500 time 1.5186 (0.7119) data time 0.0006 (0.0045) model time 1.5180 (0.7199) loss 9.5171 (7.4495) grad_norm 2.7934 (2.4313) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:03:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][600/625] eta 0:00:17 lr 0.000566 wd 0.0500 time 1.6069 (0.7182) data time 0.0006 (0.0053) model time 1.6063 (0.7256) loss 8.4930 (7.4606) grad_norm 2.2131 (2.4273) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:03:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][610/625] eta 0:00:10 lr 0.000566 wd 0.0500 time 1.1216 (0.7249) data time 0.0004 (0.0053) model time 1.1212 (0.7327) loss 6.4289 (7.4600) grad_norm 5.1688 (2.4297) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:03:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [165/300][620/625] eta 0:00:03 lr 0.000566 wd 0.0500 time 0.5724 (0.7254) data time 0.0004 (0.0058) model time 0.5720 (0.7325) loss 6.6953 (7.4653) grad_norm 2.6048 (2.4290) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:04:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 165 training takes 0:07:32 +[2024-07-25 07:04:00 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 07:04:01 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 07:04:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 20.090 (20.090) Loss 0.5381 (0.5381) Acc@1 89.844 (89.844) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 07:04:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.123 (2.082) Loss 0.8086 (0.6621) Acc@1 82.031 (86.577) Acc@5 96.338 (97.705) Mem 22339MB +[2024-07-25 07:04:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.124 (1.149) Loss 0.9204 (0.7648) Acc@1 78.320 (83.450) Acc@5 95.215 (96.666) Mem 22339MB +[2024-07-25 07:04:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.077 Acc@5 96.655 +[2024-07-25 07:04:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.1% +[2024-07-25 07:04:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 83.08% +[2024-07-25 07:04:26 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 07:04:27 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 07:04:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 22.248 (22.248) Loss 0.4941 (0.4941) Acc@1 89.893 (89.893) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 07:04:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.123 (2.265) Loss 0.7632 (0.6228) Acc@1 82.227 (86.887) Acc@5 96.484 (97.860) Mem 22339MB +[2024-07-25 07:04:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.124 (1.245) Loss 0.8906 (0.7226) Acc@1 78.467 (83.822) Acc@5 95.801 (96.870) Mem 22339MB +[2024-07-25 07:04:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.411 Acc@5 96.875 +[2024-07-25 07:04:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.4% +[2024-07-25 07:04:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.41% +[2024-07-25 07:04:54 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 07:04:55 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 07:05:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][0/625] eta 2:44:15 lr 0.000566 wd 0.0500 time 15.7681 (15.7681) data time 11.9661 (11.9661) model time 0.0000 (0.0000) loss 9.0303 (9.0303) grad_norm 2.0956 (2.0956) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:05:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][10/625] eta 0:22:56 lr 0.000566 wd 0.0500 time 0.5702 (2.2385) data time 0.0006 (1.0885) model time 0.0000 (0.0000) loss 5.9981 (7.6157) grad_norm 2.2970 (2.2941) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:05:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][20/625] eta 0:20:42 lr 0.000566 wd 0.0500 time 3.7001 (2.0532) data time 0.0006 (0.5706) model time 0.0000 (0.0000) loss 8.8273 (7.4870) grad_norm 3.1336 (2.2038) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:05:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][30/625] eta 0:15:46 lr 0.000566 wd 0.0500 time 0.5731 (1.5903) data time 0.0007 (0.3868) model time 0.0000 (0.0000) loss 8.1183 (7.5678) grad_norm 1.9982 (2.3612) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:05:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][40/625] eta 0:13:04 lr 0.000566 wd 0.0500 time 0.5731 (1.3413) data time 0.0008 (0.2926) model time 0.0000 (0.0000) loss 7.7931 (7.5589) grad_norm 2.1691 (2.3250) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:05:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][50/625] eta 0:12:01 lr 0.000566 wd 0.0500 time 0.5665 (1.2541) data time 0.0008 (0.2354) model time 0.0000 (0.0000) loss 8.5730 (7.5501) grad_norm 1.7158 (2.2461) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:06:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][60/625] eta 0:10:45 lr 0.000565 wd 0.0500 time 0.5680 (1.1421) data time 0.0007 (0.1970) model time 0.5673 (0.5700) loss 5.9580 (7.5629) grad_norm 2.2528 (2.2063) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:06:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][70/625] eta 0:09:49 lr 0.000565 wd 0.0500 time 0.5774 (1.0620) data time 0.0006 (0.1693) model time 0.5768 (0.5711) loss 6.7638 (7.5051) grad_norm 2.5210 (2.2082) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:06:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][80/625] eta 0:09:06 lr 0.000565 wd 0.0500 time 0.5795 (1.0020) data time 0.0008 (0.1485) model time 0.5787 (0.5726) loss 8.9974 (7.4836) grad_norm 3.6063 (2.2103) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:06:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][90/625] eta 0:08:30 lr 0.000565 wd 0.0500 time 0.5750 (0.9547) data time 0.0007 (0.1323) model time 0.5744 (0.5721) loss 6.1447 (7.4473) grad_norm 1.5916 (2.1886) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:06:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][100/625] eta 0:08:01 lr 0.000565 wd 0.0500 time 0.5721 (0.9173) data time 0.0008 (0.1193) model time 0.5713 (0.5729) loss 6.7697 (7.4866) grad_norm 1.6107 (2.1762) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:06:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][110/625] eta 0:07:53 lr 0.000565 wd 0.0500 time 4.2464 (0.9194) data time 0.0008 (0.1086) model time 4.2456 (0.6341) loss 7.2058 (7.4748) grad_norm 2.6511 (2.1865) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:06:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][120/625] eta 0:08:15 lr 0.000565 wd 0.0500 time 3.4031 (0.9819) data time 0.0006 (0.1096) model time 3.4025 (0.7656) loss 7.9629 (7.4762) grad_norm 2.3628 (2.2089) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:07:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][130/625] eta 0:08:28 lr 0.000565 wd 0.0500 time 0.5669 (1.0279) data time 0.0007 (0.1013) model time 0.5662 (0.8679) loss 7.1714 (7.4726) grad_norm 2.6301 (2.2219) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:07:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][140/625] eta 0:08:23 lr 0.000565 wd 0.0500 time 1.0000 (1.0372) data time 0.0008 (0.0942) model time 0.9992 (0.9001) loss 8.2652 (7.4886) grad_norm 2.0618 (2.2296) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:07:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][150/625] eta 0:08:11 lr 0.000564 wd 0.0500 time 1.0128 (1.0353) data time 0.0006 (0.0880) model time 1.0122 (0.9109) loss 6.4874 (7.4876) grad_norm 2.7366 (2.2173) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:07:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][160/625] eta 0:07:54 lr 0.000564 wd 0.0500 time 1.0423 (1.0207) data time 0.0007 (0.0826) model time 1.0416 (0.9008) loss 8.3449 (7.4925) grad_norm 1.7600 (2.2323) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:07:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][170/625] eta 0:07:41 lr 0.000564 wd 0.0500 time 0.7949 (1.0144) data time 0.0006 (0.0778) model time 0.7943 (0.9017) loss 7.0280 (7.4733) grad_norm 3.5118 (2.2773) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:08:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][180/625] eta 0:07:36 lr 0.000564 wd 0.0500 time 1.1297 (1.0262) data time 0.0008 (0.0743) model time 1.1290 (0.9258) loss 6.0132 (7.4578) grad_norm 1.9147 (2.3307) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:08:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][190/625] eta 0:07:24 lr 0.000564 wd 0.0500 time 1.0422 (1.0225) data time 0.0007 (0.0704) model time 1.0414 (0.9277) loss 6.0424 (7.4730) grad_norm 2.0270 (2.3785) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:08:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][200/625] eta 0:07:15 lr 0.000564 wd 0.0500 time 1.1973 (1.0258) data time 0.0007 (0.0670) model time 1.1966 (0.9385) loss 8.2470 (7.4845) grad_norm 2.3757 (2.4130) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:08:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][210/625] eta 0:07:05 lr 0.000564 wd 0.0500 time 1.1127 (1.0263) data time 0.0007 (0.0639) model time 1.1120 (0.9445) loss 7.0511 (7.4936) grad_norm 2.0438 (2.4060) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:08:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][220/625] eta 0:06:55 lr 0.000564 wd 0.0500 time 1.2561 (1.0253) data time 0.0008 (0.0610) model time 1.2553 (0.9480) loss 8.4187 (7.5180) grad_norm 2.4040 (2.3981) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:08:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][230/625] eta 0:06:45 lr 0.000564 wd 0.0500 time 0.6529 (1.0276) data time 0.0009 (0.0603) model time 0.6520 (0.9527) loss 7.1683 (7.5129) grad_norm 2.0323 (2.4010) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:09:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][240/625] eta 0:06:33 lr 0.000563 wd 0.0500 time 0.8290 (1.0214) data time 0.0008 (0.0590) model time 0.8282 (0.9473) loss 7.0658 (7.4975) grad_norm 3.2587 (2.3987) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:09:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][250/625] eta 0:06:27 lr 0.000563 wd 0.0500 time 0.6363 (1.0326) data time 0.0006 (0.0567) model time 0.6357 (0.9650) loss 7.5412 (7.5121) grad_norm 2.4980 (2.3977) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:09:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][260/625] eta 0:06:18 lr 0.000563 wd 0.0500 time 1.0595 (1.0383) data time 0.0008 (0.0545) model time 1.0586 (0.9752) loss 7.6234 (7.5287) grad_norm 2.9205 (2.4107) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:09:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][270/625] eta 0:06:09 lr 0.000563 wd 0.0500 time 1.3645 (1.0396) data time 0.0006 (0.0526) model time 1.3639 (0.9797) loss 7.6215 (7.5206) grad_norm 2.2053 (2.4250) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:09:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][280/625] eta 0:05:58 lr 0.000563 wd 0.0500 time 0.7952 (1.0392) data time 0.0006 (0.0507) model time 0.7945 (0.9818) loss 7.2186 (7.5237) grad_norm 2.7497 (2.4236) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:10:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][290/625] eta 0:05:50 lr 0.000563 wd 0.0500 time 1.2159 (1.0469) data time 0.0006 (0.0490) model time 1.2153 (0.9935) loss 7.8874 (7.5312) grad_norm 4.2746 (2.4268) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:10:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][300/625] eta 0:05:47 lr 0.000563 wd 0.0500 time 0.5790 (1.0681) data time 0.0009 (0.0474) model time 0.5781 (1.0211) loss 6.7873 (7.5392) grad_norm 2.4349 (2.4397) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:10:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][310/625] eta 0:05:41 lr 0.000563 wd 0.0500 time 1.4495 (1.0836) data time 0.0006 (0.0459) model time 1.4489 (1.0415) loss 6.5995 (7.5198) grad_norm 2.5468 (2.4366) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:10:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][320/625] eta 0:05:27 lr 0.000563 wd 0.0500 time 0.5712 (1.0723) data time 0.0008 (0.0445) model time 0.5704 (1.0295) loss 7.3442 (7.5038) grad_norm 1.7287 (2.4324) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:10:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][330/625] eta 0:05:11 lr 0.000563 wd 0.0500 time 0.5719 (1.0572) data time 0.0008 (0.0432) model time 0.5712 (1.0131) loss 8.3891 (7.5095) grad_norm 3.7967 (2.4337) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:10:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][340/625] eta 0:04:57 lr 0.000562 wd 0.0500 time 0.5740 (1.0429) data time 0.0006 (0.0419) model time 0.5734 (0.9979) loss 6.4381 (7.4972) grad_norm 4.6251 (2.4543) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:10:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][350/625] eta 0:04:43 lr 0.000562 wd 0.0500 time 0.5766 (1.0295) data time 0.0008 (0.0408) model time 0.5758 (0.9836) loss 8.0688 (7.4957) grad_norm 3.3320 (2.4752) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:11:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][360/625] eta 0:04:29 lr 0.000562 wd 0.0500 time 0.5716 (1.0169) data time 0.0006 (0.0397) model time 0.5710 (0.9704) loss 7.5925 (7.4926) grad_norm 2.1749 (2.4694) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:11:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][370/625] eta 0:04:16 lr 0.000562 wd 0.0500 time 0.7334 (1.0055) data time 0.0006 (0.0386) model time 0.7329 (0.9586) loss 7.8168 (7.4964) grad_norm 1.7197 (2.4589) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:11:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][380/625] eta 0:04:03 lr 0.000562 wd 0.0500 time 0.5708 (0.9953) data time 0.0006 (0.0376) model time 0.5702 (0.9483) loss 7.1895 (7.5079) grad_norm 1.6434 (2.4496) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:11:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][390/625] eta 0:03:51 lr 0.000562 wd 0.0500 time 0.7453 (0.9865) data time 0.0007 (0.0367) model time 0.7446 (0.9395) loss 7.9725 (7.4979) grad_norm 2.1378 (2.4385) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:11:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][400/625] eta 0:03:39 lr 0.000562 wd 0.0500 time 0.5729 (0.9772) data time 0.0006 (0.0358) model time 0.5723 (0.9302) loss 6.9584 (7.5010) grad_norm 1.7407 (2.4275) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:11:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][410/625] eta 0:03:28 lr 0.000562 wd 0.0500 time 0.6708 (0.9683) data time 0.0008 (0.0349) model time 0.6700 (0.9213) loss 7.8657 (7.5025) grad_norm 1.9153 (2.4142) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:11:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][420/625] eta 0:03:16 lr 0.000562 wd 0.0500 time 0.5736 (0.9590) data time 0.0007 (0.0341) model time 0.5730 (0.9120) loss 6.7739 (7.4992) grad_norm 2.7285 (2.4022) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:11:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][430/625] eta 0:03:05 lr 0.000561 wd 0.0500 time 0.5701 (0.9501) data time 0.0007 (0.0333) model time 0.5695 (0.9031) loss 7.8383 (7.4916) grad_norm 2.6899 (2.4166) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:11:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][440/625] eta 0:02:54 lr 0.000561 wd 0.0500 time 0.5741 (0.9416) data time 0.0008 (0.0326) model time 0.5733 (0.8946) loss 6.7062 (7.5034) grad_norm 1.7880 (2.4128) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:11:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][450/625] eta 0:02:43 lr 0.000561 wd 0.0500 time 0.5750 (0.9334) data time 0.0007 (0.0319) model time 0.5743 (0.8866) loss 7.6238 (7.5071) grad_norm 2.1799 (2.4143) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:12:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][460/625] eta 0:02:32 lr 0.000561 wd 0.0500 time 0.5974 (0.9257) data time 0.0006 (0.0312) model time 0.5967 (0.8790) loss 8.0867 (7.5052) grad_norm 2.6730 (2.4067) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:12:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][470/625] eta 0:02:22 lr 0.000561 wd 0.0500 time 0.5752 (0.9182) data time 0.0006 (0.0306) model time 0.5746 (0.8717) loss 6.3911 (7.4896) grad_norm 2.0745 (2.4060) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:12:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][480/625] eta 0:02:12 lr 0.000561 wd 0.0500 time 0.5764 (0.9110) data time 0.0008 (0.0300) model time 0.5756 (0.8647) loss 6.9913 (7.4906) grad_norm 2.8989 (2.4082) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:12:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][490/625] eta 0:02:02 lr 0.000561 wd 0.0500 time 0.5741 (0.9041) data time 0.0007 (0.0294) model time 0.5735 (0.8581) loss 6.6876 (7.4951) grad_norm 2.3530 (2.4076) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:12:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][500/625] eta 0:01:52 lr 0.000561 wd 0.0500 time 0.5756 (0.8975) data time 0.0006 (0.0288) model time 0.5750 (0.8517) loss 6.8894 (7.4892) grad_norm 2.8659 (2.4056) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:12:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][510/625] eta 0:01:42 lr 0.000561 wd 0.0500 time 0.5766 (0.8912) data time 0.0008 (0.0282) model time 0.5758 (0.8457) loss 7.4780 (7.4903) grad_norm 1.8512 (2.4001) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:12:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][520/625] eta 0:01:32 lr 0.000561 wd 0.0500 time 0.5723 (0.8851) data time 0.0006 (0.0277) model time 0.5717 (0.8399) loss 6.6864 (7.4916) grad_norm 2.1490 (2.3931) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:12:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][530/625] eta 0:01:23 lr 0.000560 wd 0.0500 time 0.5753 (0.8793) data time 0.0007 (0.0272) model time 0.5746 (0.8344) loss 7.7438 (7.4947) grad_norm 3.0537 (2.3840) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:12:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][540/625] eta 0:01:14 lr 0.000560 wd 0.0500 time 0.5758 (0.8736) data time 0.0008 (0.0267) model time 0.5750 (0.8290) loss 6.6018 (7.4952) grad_norm 2.0460 (2.3826) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:12:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][550/625] eta 0:01:05 lr 0.000560 wd 0.0500 time 0.5742 (0.8682) data time 0.0006 (0.0263) model time 0.5736 (0.8240) loss 7.0060 (7.4917) grad_norm 1.9763 (2.3817) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:12:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][560/625] eta 0:00:56 lr 0.000560 wd 0.0500 time 0.5843 (0.8630) data time 0.0008 (0.0258) model time 0.5835 (0.8191) loss 7.6010 (7.4913) grad_norm 3.4052 (2.3877) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:13:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][570/625] eta 0:00:47 lr 0.000560 wd 0.0500 time 0.5779 (0.8580) data time 0.0008 (0.0254) model time 0.5771 (0.8144) loss 6.9949 (7.4852) grad_norm 1.7207 (2.3845) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:13:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][580/625] eta 0:00:38 lr 0.000560 wd 0.0500 time 0.5792 (0.8531) data time 0.0006 (0.0249) model time 0.5786 (0.8099) loss 6.5891 (7.4783) grad_norm 3.5284 (2.3921) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:13:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][590/625] eta 0:00:29 lr 0.000560 wd 0.0500 time 0.5783 (0.8485) data time 0.0006 (0.0245) model time 0.5777 (0.8056) loss 6.4034 (7.4743) grad_norm 2.5078 (2.3925) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:13:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][600/625] eta 0:00:21 lr 0.000560 wd 0.0500 time 0.5720 (0.8451) data time 0.0008 (0.0241) model time 0.5711 (0.8026) loss 8.3569 (7.4641) grad_norm 3.0787 (2.3881) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:13:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][610/625] eta 0:00:12 lr 0.000560 wd 0.0500 time 0.5737 (0.8417) data time 0.0004 (0.0238) model time 0.5733 (0.7996) loss 6.3402 (7.4702) grad_norm 2.7962 (2.4046) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:13:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [166/300][620/625] eta 0:00:04 lr 0.000559 wd 0.0500 time 0.7344 (0.8388) data time 0.0004 (0.0234) model time 0.7340 (0.7973) loss 8.9821 (7.4704) grad_norm 2.7510 (2.4204) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:13:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 166 training takes 0:08:43 +[2024-07-25 07:13:38 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 07:13:40 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 07:13:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.467 (0.467) Loss 0.5273 (0.5273) Acc@1 89.600 (89.600) Acc@5 98.584 (98.584) Mem 22339MB +[2024-07-25 07:13:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.124 (0.157) Loss 0.8252 (0.6521) Acc@1 80.811 (86.457) Acc@5 96.436 (97.683) Mem 22339MB +[2024-07-25 07:13:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9243 (0.7565) Acc@1 77.441 (83.471) Acc@5 95.410 (96.596) Mem 22339MB +[2024-07-25 07:13:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.059 Acc@5 96.603 +[2024-07-25 07:13:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.1% +[2024-07-25 07:13:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.752 (0.752) Loss 0.4937 (0.4937) Acc@1 89.893 (89.893) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 07:13:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.183) Loss 0.7622 (0.6225) Acc@1 82.324 (86.914) Acc@5 96.436 (97.852) Mem 22339MB +[2024-07-25 07:13:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.155) Loss 0.8882 (0.7219) Acc@1 78.467 (83.859) Acc@5 95.801 (96.877) Mem 22339MB +[2024-07-25 07:13:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.445 Acc@5 96.883 +[2024-07-25 07:13:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.4% +[2024-07-25 07:13:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.45% +[2024-07-25 07:13:47 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 07:13:49 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 07:13:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][0/625] eta 0:09:39 lr 0.000559 wd 0.0500 time 0.9270 (0.9270) data time 0.4096 (0.4096) model time 0.0000 (0.0000) loss 8.5984 (8.5984) grad_norm 1.9216 (1.9216) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:13:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][10/625] eta 0:06:33 lr 0.000559 wd 0.0500 time 0.5710 (0.6403) data time 0.0008 (0.0380) model time 0.0000 (0.0000) loss 8.3519 (7.5606) grad_norm 2.5310 (2.2473) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:14:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][20/625] eta 0:06:07 lr 0.000559 wd 0.0500 time 0.5702 (0.6080) data time 0.0006 (0.0203) model time 0.0000 (0.0000) loss 7.0524 (7.4244) grad_norm 1.8505 (2.6701) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:14:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][30/625] eta 0:05:54 lr 0.000559 wd 0.0500 time 0.5707 (0.5962) data time 0.0006 (0.0140) model time 0.0000 (0.0000) loss 7.3034 (7.4238) grad_norm 3.3417 (2.7183) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:14:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][40/625] eta 0:05:45 lr 0.000559 wd 0.0500 time 0.5615 (0.5901) data time 0.0008 (0.0108) model time 0.0000 (0.0000) loss 6.4404 (7.4205) grad_norm 2.1316 (2.7416) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:14:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][50/625] eta 0:05:37 lr 0.000559 wd 0.0500 time 0.5705 (0.5866) data time 0.0008 (0.0088) model time 0.0000 (0.0000) loss 7.6473 (7.3723) grad_norm 1.6114 (2.7088) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:14:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][60/625] eta 0:05:30 lr 0.000559 wd 0.0500 time 0.5678 (0.5847) data time 0.0008 (0.0075) model time 0.5669 (0.5741) loss 9.1812 (7.4370) grad_norm 2.1781 (2.6078) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:14:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][70/625] eta 0:05:23 lr 0.000559 wd 0.0500 time 0.5608 (0.5836) data time 0.0006 (0.0065) model time 0.5602 (0.5753) loss 5.7225 (7.4107) grad_norm 2.1715 (2.5513) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:14:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][80/625] eta 0:05:17 lr 0.000559 wd 0.0500 time 0.5751 (0.5826) data time 0.0006 (0.0058) model time 0.5745 (0.5751) loss 7.3015 (7.4083) grad_norm 1.8542 (2.5306) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:14:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][90/625] eta 0:05:11 lr 0.000558 wd 0.0500 time 0.5739 (0.5817) data time 0.0006 (0.0053) model time 0.5733 (0.5748) loss 7.5589 (7.3924) grad_norm 2.4874 (2.5176) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:14:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][100/625] eta 0:05:05 lr 0.000558 wd 0.0500 time 0.5634 (0.5810) data time 0.0006 (0.0048) model time 0.5628 (0.5747) loss 7.3051 (7.4041) grad_norm 2.9442 (2.5406) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:14:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][110/625] eta 0:04:59 lr 0.000558 wd 0.0500 time 0.5745 (0.5806) data time 0.0007 (0.0045) model time 0.5738 (0.5748) loss 8.0499 (7.4562) grad_norm 2.3243 (2.5439) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:14:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][120/625] eta 0:04:52 lr 0.000558 wd 0.0500 time 0.5744 (0.5802) data time 0.0006 (0.0042) model time 0.5738 (0.5748) loss 7.7688 (7.4256) grad_norm 1.9154 (2.5125) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:15:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][130/625] eta 0:04:46 lr 0.000558 wd 0.0500 time 0.5724 (0.5798) data time 0.0006 (0.0039) model time 0.5718 (0.5746) loss 6.9671 (7.4251) grad_norm 3.5669 (2.5185) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:15:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][140/625] eta 0:04:41 lr 0.000558 wd 0.0500 time 0.5617 (0.5796) data time 0.0007 (0.0037) model time 0.5610 (0.5749) loss 6.2305 (7.4270) grad_norm 3.2171 (2.5093) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:15:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][150/625] eta 0:04:35 lr 0.000558 wd 0.0500 time 0.5753 (0.5796) data time 0.0006 (0.0035) model time 0.5747 (0.5753) loss 7.3306 (7.4565) grad_norm 3.3054 (2.5166) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:15:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][160/625] eta 0:04:29 lr 0.000558 wd 0.0500 time 0.5712 (0.5793) data time 0.0006 (0.0033) model time 0.5706 (0.5752) loss 8.0174 (7.4679) grad_norm 2.9169 (2.5111) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:15:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][170/625] eta 0:04:23 lr 0.000558 wd 0.0500 time 0.5724 (0.5790) data time 0.0008 (0.0032) model time 0.5716 (0.5751) loss 7.8026 (7.4886) grad_norm 3.0646 (2.5120) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:15:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][180/625] eta 0:04:17 lr 0.000557 wd 0.0500 time 0.5647 (0.5788) data time 0.0006 (0.0030) model time 0.5641 (0.5750) loss 8.1530 (7.4551) grad_norm 2.6436 (2.5261) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:15:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][190/625] eta 0:04:12 lr 0.000557 wd 0.0500 time 0.5749 (0.5801) data time 0.0008 (0.0029) model time 0.5741 (0.5769) loss 8.2448 (7.4444) grad_norm 1.5926 (2.5079) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:15:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][200/625] eta 0:04:07 lr 0.000557 wd 0.0500 time 0.5760 (0.5827) data time 0.0010 (0.0028) model time 0.5750 (0.5806) loss 8.0630 (7.4247) grad_norm 2.0254 (2.4949) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:15:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][210/625] eta 0:04:03 lr 0.000557 wd 0.0500 time 0.5741 (0.5858) data time 0.0006 (0.0027) model time 0.5735 (0.5847) loss 7.4444 (7.4367) grad_norm 2.5532 (2.4846) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:15:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][220/625] eta 0:03:58 lr 0.000557 wd 0.0500 time 0.6548 (0.5887) data time 0.0007 (0.0026) model time 0.6541 (0.5886) loss 6.3563 (7.4379) grad_norm 2.3322 (2.4689) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:16:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][230/625] eta 0:03:52 lr 0.000557 wd 0.0500 time 0.6975 (0.5891) data time 0.0007 (0.0026) model time 0.6968 (0.5890) loss 8.6094 (7.4337) grad_norm 2.1613 (2.4674) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:16:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][240/625] eta 0:03:46 lr 0.000557 wd 0.0500 time 0.5728 (0.5884) data time 0.0008 (0.0025) model time 0.5720 (0.5881) loss 8.0718 (7.4404) grad_norm 3.4315 (2.4718) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:16:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][250/625] eta 0:03:40 lr 0.000557 wd 0.0500 time 0.5752 (0.5879) data time 0.0008 (0.0024) model time 0.5744 (0.5874) loss 8.9151 (7.4541) grad_norm 2.5676 (2.4573) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:16:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][260/625] eta 0:03:34 lr 0.000557 wd 0.0500 time 0.5759 (0.5874) data time 0.0008 (0.0024) model time 0.5751 (0.5868) loss 7.5956 (7.4455) grad_norm 2.3607 (2.4682) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:16:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][270/625] eta 0:03:28 lr 0.000557 wd 0.0500 time 0.5734 (0.5869) data time 0.0006 (0.0023) model time 0.5728 (0.5862) loss 6.7530 (7.4694) grad_norm 1.9427 (2.4637) loss_scale 2048.0000 (1061.7860) mem 22339MB +[2024-07-25 07:16:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][280/625] eta 0:03:22 lr 0.000556 wd 0.0500 time 0.5748 (0.5866) data time 0.0008 (0.0022) model time 0.5740 (0.5858) loss 6.7475 (7.4888) grad_norm 2.0625 (2.4563) loss_scale 2048.0000 (1096.8826) mem 22339MB +[2024-07-25 07:16:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][290/625] eta 0:03:16 lr 0.000556 wd 0.0500 time 0.5730 (0.5861) data time 0.0008 (0.0022) model time 0.5721 (0.5853) loss 8.7832 (7.4798) grad_norm 2.3144 (2.4494) loss_scale 2048.0000 (1129.5670) mem 22339MB +[2024-07-25 07:16:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][300/625] eta 0:03:10 lr 0.000556 wd 0.0500 time 0.5727 (0.5858) data time 0.0006 (0.0022) model time 0.5721 (0.5848) loss 6.1239 (7.4829) grad_norm 2.7691 (2.4737) loss_scale 2048.0000 (1160.0797) mem 22339MB +[2024-07-25 07:16:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][310/625] eta 0:03:04 lr 0.000556 wd 0.0500 time 0.5731 (0.5854) data time 0.0007 (0.0021) model time 0.5724 (0.5844) loss 7.8772 (7.4882) grad_norm 4.2618 (2.5047) loss_scale 2048.0000 (1188.6302) mem 22339MB +[2024-07-25 07:16:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][320/625] eta 0:02:58 lr 0.000556 wd 0.0500 time 0.5736 (0.5850) data time 0.0007 (0.0021) model time 0.5729 (0.5840) loss 8.4037 (7.4942) grad_norm 3.4815 (2.5191) loss_scale 2048.0000 (1215.4019) mem 22339MB +[2024-07-25 07:17:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][330/625] eta 0:02:52 lr 0.000556 wd 0.0500 time 0.5742 (0.5848) data time 0.0009 (0.0020) model time 0.5733 (0.5837) loss 8.6012 (7.5061) grad_norm 3.3194 (2.5247) loss_scale 2048.0000 (1240.5559) mem 22339MB +[2024-07-25 07:17:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][340/625] eta 0:02:46 lr 0.000556 wd 0.0500 time 0.5721 (0.5846) data time 0.0008 (0.0020) model time 0.5713 (0.5834) loss 8.9152 (7.4976) grad_norm 3.4536 (2.5360) loss_scale 2048.0000 (1264.2346) mem 22339MB +[2024-07-25 07:17:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][350/625] eta 0:02:40 lr 0.000556 wd 0.0500 time 0.5716 (0.5843) data time 0.0006 (0.0020) model time 0.5710 (0.5831) loss 8.0468 (7.4928) grad_norm 2.8442 (2.5414) loss_scale 2048.0000 (1286.5641) mem 22339MB +[2024-07-25 07:17:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][360/625] eta 0:02:34 lr 0.000556 wd 0.0500 time 0.5726 (0.5840) data time 0.0008 (0.0019) model time 0.5719 (0.5828) loss 6.9107 (7.4958) grad_norm 3.0346 (2.5491) loss_scale 2048.0000 (1307.6565) mem 22339MB +[2024-07-25 07:17:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][370/625] eta 0:02:28 lr 0.000555 wd 0.0500 time 0.5722 (0.5837) data time 0.0006 (0.0019) model time 0.5716 (0.5825) loss 6.0731 (7.4894) grad_norm 2.0612 (2.5649) loss_scale 2048.0000 (1327.6119) mem 22339MB +[2024-07-25 07:17:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][380/625] eta 0:02:22 lr 0.000555 wd 0.0500 time 0.5713 (0.5835) data time 0.0007 (0.0019) model time 0.5706 (0.5822) loss 8.6804 (7.4909) grad_norm 1.7761 (2.5656) loss_scale 2048.0000 (1346.5197) mem 22339MB +[2024-07-25 07:17:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][390/625] eta 0:02:17 lr 0.000555 wd 0.0500 time 0.5743 (0.5833) data time 0.0008 (0.0019) model time 0.5735 (0.5820) loss 7.6470 (7.4837) grad_norm 3.7510 (2.5654) loss_scale 2048.0000 (1364.4604) mem 22339MB +[2024-07-25 07:17:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][400/625] eta 0:02:11 lr 0.000555 wd 0.0500 time 0.5695 (0.5831) data time 0.0006 (0.0018) model time 0.5689 (0.5817) loss 7.0959 (7.4774) grad_norm 2.2672 (2.5576) loss_scale 2048.0000 (1381.5062) mem 22339MB +[2024-07-25 07:17:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][410/625] eta 0:02:05 lr 0.000555 wd 0.0500 time 0.7105 (0.5842) data time 0.0008 (0.0018) model time 0.7098 (0.5830) loss 6.9598 (7.4729) grad_norm 2.1661 (2.5512) loss_scale 2048.0000 (1397.7226) mem 22339MB +[2024-07-25 07:17:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][420/625] eta 0:01:59 lr 0.000555 wd 0.0500 time 0.5604 (0.5852) data time 0.0006 (0.0018) model time 0.5597 (0.5842) loss 6.0789 (7.4709) grad_norm 2.7603 (2.5497) loss_scale 2048.0000 (1413.1686) mem 22339MB +[2024-07-25 07:18:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][430/625] eta 0:01:54 lr 0.000555 wd 0.0500 time 0.6958 (0.5867) data time 0.0006 (0.0018) model time 0.6953 (0.5860) loss 7.4089 (7.4729) grad_norm 2.0267 (2.5492) loss_scale 2048.0000 (1427.8979) mem 22339MB +[2024-07-25 07:18:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][440/625] eta 0:01:48 lr 0.000555 wd 0.0500 time 0.7243 (0.5880) data time 0.0008 (0.0017) model time 0.7235 (0.5874) loss 7.7871 (7.4757) grad_norm 1.7432 (2.5420) loss_scale 2048.0000 (1441.9592) mem 22339MB +[2024-07-25 07:18:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][450/625] eta 0:01:42 lr 0.000555 wd 0.0500 time 0.5680 (0.5881) data time 0.0008 (0.0017) model time 0.5672 (0.5874) loss 8.5627 (7.4723) grad_norm 1.7800 (2.5362) loss_scale 2048.0000 (1455.3969) mem 22339MB +[2024-07-25 07:18:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][460/625] eta 0:01:36 lr 0.000555 wd 0.0500 time 0.5687 (0.5878) data time 0.0007 (0.0017) model time 0.5680 (0.5871) loss 8.4473 (7.4670) grad_norm 2.3783 (2.5241) loss_scale 2048.0000 (1468.2516) mem 22339MB +[2024-07-25 07:18:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][470/625] eta 0:01:31 lr 0.000554 wd 0.0500 time 0.5734 (0.5875) data time 0.0008 (0.0017) model time 0.5727 (0.5868) loss 7.0071 (7.4708) grad_norm 1.7701 (2.5155) loss_scale 2048.0000 (1480.5605) mem 22339MB +[2024-07-25 07:18:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][480/625] eta 0:01:25 lr 0.000554 wd 0.0500 time 0.5684 (0.5872) data time 0.0006 (0.0017) model time 0.5678 (0.5865) loss 7.6882 (7.4730) grad_norm 2.0350 (2.5087) loss_scale 2048.0000 (1492.3576) mem 22339MB +[2024-07-25 07:18:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][490/625] eta 0:01:19 lr 0.000554 wd 0.0500 time 0.5721 (0.5870) data time 0.0006 (0.0016) model time 0.5716 (0.5862) loss 9.3138 (7.4785) grad_norm 3.3749 (2.5120) loss_scale 2048.0000 (1503.6741) mem 22339MB +[2024-07-25 07:18:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][500/625] eta 0:01:13 lr 0.000554 wd 0.0500 time 0.5756 (0.5867) data time 0.0008 (0.0016) model time 0.5748 (0.5859) loss 8.7091 (7.4817) grad_norm 3.3104 (2.5210) loss_scale 2048.0000 (1514.5389) mem 22339MB +[2024-07-25 07:18:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][510/625] eta 0:01:07 lr 0.000554 wd 0.0500 time 0.5722 (0.5865) data time 0.0006 (0.0016) model time 0.5716 (0.5857) loss 8.9302 (7.4808) grad_norm 2.4286 (2.5188) loss_scale 2048.0000 (1524.9785) mem 22339MB +[2024-07-25 07:18:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][520/625] eta 0:01:01 lr 0.000554 wd 0.0500 time 0.5728 (0.5863) data time 0.0006 (0.0016) model time 0.5722 (0.5854) loss 6.7578 (7.4758) grad_norm 1.8925 (2.5102) loss_scale 2048.0000 (1535.0173) mem 22339MB +[2024-07-25 07:19:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][530/625] eta 0:00:55 lr 0.000554 wd 0.0500 time 0.5733 (0.5860) data time 0.0008 (0.0016) model time 0.5725 (0.5852) loss 8.8665 (7.4749) grad_norm 1.5971 (2.5042) loss_scale 2048.0000 (1544.6780) mem 22339MB +[2024-07-25 07:19:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][540/625] eta 0:00:49 lr 0.000554 wd 0.0500 time 0.5738 (0.5858) data time 0.0006 (0.0016) model time 0.5732 (0.5849) loss 7.0658 (7.4716) grad_norm 4.2050 (2.5052) loss_scale 2048.0000 (1553.9815) mem 22339MB +[2024-07-25 07:19:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][550/625] eta 0:00:43 lr 0.000554 wd 0.0500 time 0.5646 (0.5856) data time 0.0008 (0.0015) model time 0.5639 (0.5847) loss 6.2623 (7.4695) grad_norm 3.6796 (2.5090) loss_scale 2048.0000 (1562.9474) mem 22339MB +[2024-07-25 07:19:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][560/625] eta 0:00:38 lr 0.000553 wd 0.0500 time 0.5717 (0.5854) data time 0.0008 (0.0015) model time 0.5709 (0.5845) loss 6.7700 (7.4740) grad_norm 2.0357 (2.5026) loss_scale 2048.0000 (1571.5936) mem 22339MB +[2024-07-25 07:19:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][570/625] eta 0:00:32 lr 0.000553 wd 0.0500 time 0.5718 (0.5852) data time 0.0009 (0.0015) model time 0.5709 (0.5843) loss 6.1522 (7.4657) grad_norm 2.0796 (2.4993) loss_scale 2048.0000 (1579.9370) mem 22339MB +[2024-07-25 07:19:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][580/625] eta 0:00:26 lr 0.000553 wd 0.0500 time 0.5738 (0.5850) data time 0.0008 (0.0015) model time 0.5730 (0.5841) loss 7.9000 (7.4662) grad_norm 2.2944 (2.4974) loss_scale 2048.0000 (1587.9931) mem 22339MB +[2024-07-25 07:19:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][590/625] eta 0:00:20 lr 0.000553 wd 0.0500 time 0.5747 (0.5848) data time 0.0009 (0.0015) model time 0.5738 (0.5839) loss 7.9578 (7.4606) grad_norm 2.3346 (2.4950) loss_scale 2048.0000 (1595.7766) mem 22339MB +[2024-07-25 07:19:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][600/625] eta 0:00:14 lr 0.000553 wd 0.0500 time 0.5726 (0.5847) data time 0.0006 (0.0015) model time 0.5720 (0.5837) loss 6.9694 (7.4644) grad_norm 1.6847 (2.4963) loss_scale 2048.0000 (1603.3012) mem 22339MB +[2024-07-25 07:19:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][610/625] eta 0:00:08 lr 0.000553 wd 0.0500 time 0.5721 (0.5845) data time 0.0004 (0.0015) model time 0.5717 (0.5835) loss 6.5586 (7.4639) grad_norm 5.0877 (2.5007) loss_scale 2048.0000 (1610.5794) mem 22339MB +[2024-07-25 07:19:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [167/300][620/625] eta 0:00:02 lr 0.000553 wd 0.0500 time 0.5722 (0.5843) data time 0.0006 (0.0015) model time 0.5717 (0.5833) loss 8.2581 (7.4592) grad_norm 1.9060 (2.5055) loss_scale 2048.0000 (1617.6232) mem 22339MB +[2024-07-25 07:19:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 167 training takes 0:06:05 +[2024-07-25 07:19:54 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 07:19:56 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 07:19:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.472 (0.472) Loss 0.5107 (0.5107) Acc@1 89.307 (89.307) Acc@5 98.682 (98.682) Mem 22339MB +[2024-07-25 07:19:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.8096 (0.6388) Acc@1 81.934 (86.430) Acc@5 96.191 (97.732) Mem 22339MB +[2024-07-25 07:19:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8760 (0.7441) Acc@1 78.467 (83.382) Acc@5 95.850 (96.673) Mem 22339MB +[2024-07-25 07:19:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.061 Acc@5 96.651 +[2024-07-25 07:19:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.1% +[2024-07-25 07:20:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.799 (0.799) Loss 0.4946 (0.4946) Acc@1 89.941 (89.941) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 07:20:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.187) Loss 0.7617 (0.6230) Acc@1 82.324 (86.914) Acc@5 96.436 (97.843) Mem 22339MB +[2024-07-25 07:20:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.158) Loss 0.8867 (0.7220) Acc@1 78.516 (83.882) Acc@5 95.752 (96.873) Mem 22339MB +[2024-07-25 07:20:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.473 Acc@5 96.873 +[2024-07-25 07:20:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.5% +[2024-07-25 07:20:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.47% +[2024-07-25 07:20:03 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 07:20:04 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 07:20:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][0/625] eta 0:10:08 lr 0.000553 wd 0.0500 time 0.9731 (0.9731) data time 0.4530 (0.4530) model time 0.0000 (0.0000) loss 8.5900 (8.5900) grad_norm 2.2415 (2.2415) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:20:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][10/625] eta 0:06:47 lr 0.000553 wd 0.0500 time 0.7278 (0.6632) data time 0.0006 (0.0419) model time 0.0000 (0.0000) loss 8.7924 (7.9384) grad_norm 2.2174 (2.7370) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:20:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][20/625] eta 0:06:29 lr 0.000553 wd 0.0500 time 0.5702 (0.6433) data time 0.0008 (0.0223) model time 0.0000 (0.0000) loss 7.7367 (7.6869) grad_norm 1.7428 (2.5108) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:20:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][30/625] eta 0:06:23 lr 0.000552 wd 0.0500 time 0.7525 (0.6440) data time 0.0008 (0.0154) model time 0.0000 (0.0000) loss 9.2201 (7.7584) grad_norm 2.2097 (2.3667) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:20:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][40/625] eta 0:06:11 lr 0.000552 wd 0.0500 time 0.5712 (0.6343) data time 0.0007 (0.0118) model time 0.0000 (0.0000) loss 7.1876 (7.5720) grad_norm 2.0135 (2.3092) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:20:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][50/625] eta 0:05:59 lr 0.000552 wd 0.0500 time 0.5749 (0.6250) data time 0.0008 (0.0096) model time 0.0000 (0.0000) loss 7.5569 (7.6370) grad_norm 1.6662 (2.2374) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:20:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][60/625] eta 0:05:48 lr 0.000552 wd 0.0500 time 0.5734 (0.6166) data time 0.0008 (0.0082) model time 0.5727 (0.5725) loss 8.9528 (7.6628) grad_norm 3.4272 (2.2426) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:20:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][70/625] eta 0:05:38 lr 0.000552 wd 0.0500 time 0.5737 (0.6106) data time 0.0008 (0.0071) model time 0.5729 (0.5732) loss 8.0020 (7.6323) grad_norm 2.3097 (2.2348) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:20:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][80/625] eta 0:05:30 lr 0.000552 wd 0.0500 time 0.5733 (0.6063) data time 0.0006 (0.0064) model time 0.5727 (0.5737) loss 6.6213 (7.5338) grad_norm 2.3045 (2.2560) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:20:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][90/625] eta 0:05:22 lr 0.000552 wd 0.0500 time 0.5764 (0.6029) data time 0.0008 (0.0057) model time 0.5756 (0.5739) loss 8.0224 (7.5074) grad_norm 2.3061 (2.2770) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:21:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][100/625] eta 0:05:15 lr 0.000552 wd 0.0500 time 0.5713 (0.6002) data time 0.0006 (0.0053) model time 0.5707 (0.5740) loss 6.8823 (7.5002) grad_norm 1.8445 (2.2940) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:21:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][110/625] eta 0:05:07 lr 0.000552 wd 0.0500 time 0.5775 (0.5980) data time 0.0008 (0.0048) model time 0.5767 (0.5742) loss 7.5908 (7.4747) grad_norm 3.2972 (2.3441) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:21:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][120/625] eta 0:05:00 lr 0.000551 wd 0.0500 time 0.5712 (0.5960) data time 0.0007 (0.0045) model time 0.5705 (0.5741) loss 8.0945 (7.4536) grad_norm 3.7061 (2.3662) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:21:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][130/625] eta 0:04:54 lr 0.000551 wd 0.0500 time 0.5629 (0.5942) data time 0.0008 (0.0042) model time 0.5621 (0.5737) loss 8.6176 (7.4750) grad_norm 4.2713 (2.3870) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:21:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][140/625] eta 0:04:47 lr 0.000551 wd 0.0500 time 0.5729 (0.5928) data time 0.0007 (0.0040) model time 0.5722 (0.5738) loss 7.2965 (7.4734) grad_norm 2.0973 (2.4137) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:21:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][150/625] eta 0:04:40 lr 0.000551 wd 0.0500 time 0.5672 (0.5915) data time 0.0006 (0.0038) model time 0.5666 (0.5736) loss 6.0409 (7.4746) grad_norm 2.0106 (2.4187) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:21:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][160/625] eta 0:04:34 lr 0.000551 wd 0.0500 time 0.5721 (0.5912) data time 0.0006 (0.0036) model time 0.5715 (0.5747) loss 7.4104 (7.4999) grad_norm 2.3175 (2.4116) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:21:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][170/625] eta 0:04:28 lr 0.000551 wd 0.0500 time 0.5718 (0.5902) data time 0.0006 (0.0034) model time 0.5712 (0.5746) loss 7.3854 (7.5063) grad_norm 1.6741 (2.3931) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:21:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][180/625] eta 0:04:22 lr 0.000551 wd 0.0500 time 0.5750 (0.5893) data time 0.0006 (0.0033) model time 0.5744 (0.5745) loss 6.5493 (7.4829) grad_norm 1.8997 (2.4050) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:21:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][190/625] eta 0:04:15 lr 0.000551 wd 0.0500 time 0.5716 (0.5885) data time 0.0006 (0.0031) model time 0.5710 (0.5744) loss 6.7147 (7.4719) grad_norm 2.7428 (2.4019) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:22:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][200/625] eta 0:04:09 lr 0.000551 wd 0.0500 time 0.5718 (0.5877) data time 0.0008 (0.0030) model time 0.5711 (0.5742) loss 6.1219 (7.4529) grad_norm 2.0198 (2.4269) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:22:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][210/625] eta 0:04:03 lr 0.000551 wd 0.0500 time 0.5731 (0.5871) data time 0.0006 (0.0029) model time 0.5725 (0.5743) loss 7.5112 (7.4522) grad_norm 1.9996 (2.4254) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:22:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][220/625] eta 0:03:57 lr 0.000550 wd 0.0500 time 0.6349 (0.5869) data time 0.0006 (0.0028) model time 0.6343 (0.5746) loss 7.9608 (7.4522) grad_norm 1.9896 (2.4035) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:22:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][230/625] eta 0:03:52 lr 0.000550 wd 0.0500 time 0.5645 (0.5875) data time 0.0006 (0.0027) model time 0.5639 (0.5762) loss 6.0377 (7.4414) grad_norm 5.9120 (2.4155) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:22:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][240/625] eta 0:03:47 lr 0.000550 wd 0.0500 time 0.5712 (0.5905) data time 0.0008 (0.0026) model time 0.5704 (0.5804) loss 8.2666 (7.4541) grad_norm 2.3532 (2.4109) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:22:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][250/625] eta 0:03:42 lr 0.000550 wd 0.0500 time 0.7437 (0.5921) data time 0.0008 (0.0026) model time 0.7429 (0.5829) loss 8.2064 (7.4440) grad_norm 2.2886 (2.4141) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:22:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][260/625] eta 0:03:36 lr 0.000550 wd 0.0500 time 0.5702 (0.5930) data time 0.0008 (0.0025) model time 0.5694 (0.5845) loss 7.3314 (7.4528) grad_norm 2.1114 (2.4025) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:22:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][270/625] eta 0:03:30 lr 0.000550 wd 0.0500 time 0.5707 (0.5933) data time 0.0008 (0.0024) model time 0.5699 (0.5851) loss 8.2702 (7.4476) grad_norm 2.5487 (2.4001) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:22:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][280/625] eta 0:03:24 lr 0.000550 wd 0.0500 time 0.5723 (0.5926) data time 0.0008 (0.0024) model time 0.5715 (0.5846) loss 5.5546 (7.4403) grad_norm 1.8678 (2.3861) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:22:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][290/625] eta 0:03:18 lr 0.000550 wd 0.0500 time 0.5862 (0.5920) data time 0.0006 (0.0023) model time 0.5857 (0.5842) loss 7.4407 (7.4356) grad_norm 2.5640 (2.3771) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:23:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][300/625] eta 0:03:12 lr 0.000550 wd 0.0500 time 0.5769 (0.5915) data time 0.0006 (0.0023) model time 0.5763 (0.5839) loss 5.7112 (7.4328) grad_norm 1.8106 (2.3691) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:23:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][310/625] eta 0:03:06 lr 0.000549 wd 0.0500 time 0.5736 (0.5910) data time 0.0006 (0.0022) model time 0.5730 (0.5835) loss 6.1138 (7.4363) grad_norm 1.7018 (2.3615) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:23:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][320/625] eta 0:03:00 lr 0.000549 wd 0.0500 time 0.5746 (0.5904) data time 0.0006 (0.0022) model time 0.5741 (0.5831) loss 6.9744 (7.4330) grad_norm 2.2811 (2.3723) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:23:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][330/625] eta 0:02:54 lr 0.000549 wd 0.0500 time 0.5732 (0.5900) data time 0.0008 (0.0021) model time 0.5724 (0.5828) loss 7.7160 (7.4326) grad_norm 2.2316 (2.3625) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:23:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][340/625] eta 0:02:48 lr 0.000549 wd 0.0500 time 0.5749 (0.5896) data time 0.0008 (0.0021) model time 0.5741 (0.5826) loss 8.1337 (7.4321) grad_norm 1.5652 (2.3593) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:23:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][350/625] eta 0:02:42 lr 0.000549 wd 0.0500 time 0.5738 (0.5891) data time 0.0008 (0.0021) model time 0.5730 (0.5822) loss 7.9519 (7.4307) grad_norm 2.3608 (2.3598) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:23:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][360/625] eta 0:02:36 lr 0.000549 wd 0.0500 time 0.5718 (0.5887) data time 0.0008 (0.0020) model time 0.5710 (0.5820) loss 7.9726 (7.4513) grad_norm 1.7934 (2.3550) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:23:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][370/625] eta 0:02:30 lr 0.000549 wd 0.0500 time 0.5726 (0.5883) data time 0.0008 (0.0020) model time 0.5719 (0.5817) loss 7.7621 (7.4491) grad_norm 2.2693 (2.3475) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:23:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][380/625] eta 0:02:24 lr 0.000549 wd 0.0500 time 0.5700 (0.5883) data time 0.0006 (0.0020) model time 0.5694 (0.5819) loss 8.1317 (7.4511) grad_norm 5.1363 (2.3737) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:23:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][390/625] eta 0:02:18 lr 0.000549 wd 0.0500 time 0.5740 (0.5880) data time 0.0006 (0.0019) model time 0.5734 (0.5817) loss 6.4251 (7.4439) grad_norm 1.8461 (2.3899) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:24:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][400/625] eta 0:02:12 lr 0.000549 wd 0.0500 time 0.5705 (0.5877) data time 0.0008 (0.0019) model time 0.5697 (0.5815) loss 8.4156 (7.4407) grad_norm 2.4045 (2.3804) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:24:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][410/625] eta 0:02:06 lr 0.000548 wd 0.0500 time 0.5728 (0.5874) data time 0.0009 (0.0019) model time 0.5719 (0.5813) loss 8.2252 (7.4457) grad_norm 2.0941 (2.3736) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:24:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][420/625] eta 0:02:00 lr 0.000548 wd 0.0500 time 0.5748 (0.5872) data time 0.0006 (0.0018) model time 0.5742 (0.5812) loss 6.8152 (7.4496) grad_norm 1.6537 (2.3679) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:24:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][430/625] eta 0:01:54 lr 0.000548 wd 0.0500 time 0.5737 (0.5869) data time 0.0008 (0.0018) model time 0.5729 (0.5810) loss 7.6705 (7.4645) grad_norm 1.6909 (2.3697) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:24:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][440/625] eta 0:01:48 lr 0.000548 wd 0.0500 time 0.7064 (0.5869) data time 0.0006 (0.0018) model time 0.7058 (0.5812) loss 7.7175 (7.4691) grad_norm 2.2107 (2.3764) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:24:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][450/625] eta 0:01:42 lr 0.000548 wd 0.0500 time 0.5755 (0.5869) data time 0.0006 (0.0018) model time 0.5749 (0.5813) loss 6.7289 (7.4685) grad_norm 2.5946 (2.3823) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:24:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][460/625] eta 0:01:37 lr 0.000548 wd 0.0500 time 0.7309 (0.5880) data time 0.0007 (0.0018) model time 0.7302 (0.5826) loss 8.1310 (7.4769) grad_norm 2.0678 (2.3773) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:24:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][470/625] eta 0:01:31 lr 0.000548 wd 0.0500 time 0.7427 (0.5891) data time 0.0007 (0.0017) model time 0.7420 (0.5839) loss 7.8157 (7.4838) grad_norm 1.8422 (2.3739) loss_scale 2048.0000 (2048.0000) mem 22339MB +[2024-07-25 07:24:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][480/625] eta 0:01:25 lr 0.000548 wd 0.0500 time 0.7314 (0.5901) data time 0.0007 (0.0017) model time 0.7307 (0.5851) loss 7.5186 (7.4879) grad_norm 1.6149 (inf) loss_scale 1024.0000 (2026.7110) mem 22339MB +[2024-07-25 07:24:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][490/625] eta 0:01:19 lr 0.000548 wd 0.0500 time 0.5723 (0.5898) data time 0.0006 (0.0017) model time 0.5716 (0.5849) loss 6.8780 (7.4805) grad_norm 1.9829 (inf) loss_scale 1024.0000 (2006.2892) mem 22339MB +[2024-07-25 07:25:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][500/625] eta 0:01:13 lr 0.000547 wd 0.0500 time 0.5722 (0.5895) data time 0.0006 (0.0017) model time 0.5716 (0.5847) loss 7.7809 (7.4709) grad_norm 1.9516 (inf) loss_scale 1024.0000 (1986.6826) mem 22339MB +[2024-07-25 07:25:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][510/625] eta 0:01:07 lr 0.000547 wd 0.0500 time 0.5676 (0.5892) data time 0.0007 (0.0017) model time 0.5668 (0.5845) loss 7.2373 (7.4746) grad_norm 1.6435 (inf) loss_scale 1024.0000 (1967.8434) mem 22339MB +[2024-07-25 07:25:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][520/625] eta 0:01:01 lr 0.000547 wd 0.0500 time 0.5734 (0.5890) data time 0.0008 (0.0017) model time 0.5726 (0.5842) loss 6.4534 (7.4720) grad_norm 1.6572 (inf) loss_scale 1024.0000 (1949.7274) mem 22339MB +[2024-07-25 07:25:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][530/625] eta 0:00:55 lr 0.000547 wd 0.0500 time 0.5745 (0.5887) data time 0.0007 (0.0017) model time 0.5737 (0.5841) loss 8.7413 (7.4767) grad_norm 2.3396 (inf) loss_scale 1024.0000 (1932.2938) mem 22339MB +[2024-07-25 07:25:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][540/625] eta 0:00:50 lr 0.000547 wd 0.0500 time 0.5708 (0.5885) data time 0.0008 (0.0016) model time 0.5701 (0.5838) loss 8.4752 (7.4723) grad_norm 1.7418 (inf) loss_scale 1024.0000 (1915.5046) mem 22339MB +[2024-07-25 07:25:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][550/625] eta 0:00:44 lr 0.000547 wd 0.0500 time 0.5746 (0.5882) data time 0.0006 (0.0016) model time 0.5740 (0.5836) loss 7.7984 (7.4727) grad_norm 5.7161 (inf) loss_scale 1024.0000 (1899.3249) mem 22339MB +[2024-07-25 07:25:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][560/625] eta 0:00:38 lr 0.000547 wd 0.0500 time 0.5706 (0.5880) data time 0.0009 (0.0016) model time 0.5697 (0.5834) loss 6.5300 (7.4692) grad_norm 1.9497 (inf) loss_scale 1024.0000 (1883.7219) mem 22339MB +[2024-07-25 07:25:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][570/625] eta 0:00:32 lr 0.000547 wd 0.0500 time 0.5732 (0.5877) data time 0.0008 (0.0016) model time 0.5725 (0.5833) loss 7.9707 (7.4623) grad_norm 2.7714 (inf) loss_scale 1024.0000 (1868.6655) mem 22339MB +[2024-07-25 07:25:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][580/625] eta 0:00:26 lr 0.000547 wd 0.0500 time 0.5741 (0.5875) data time 0.0008 (0.0016) model time 0.5733 (0.5831) loss 8.9290 (7.4548) grad_norm 2.1457 (inf) loss_scale 1024.0000 (1854.1274) mem 22339MB +[2024-07-25 07:25:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][590/625] eta 0:00:20 lr 0.000546 wd 0.0500 time 0.5769 (0.5873) data time 0.0008 (0.0016) model time 0.5762 (0.5829) loss 7.1151 (7.4533) grad_norm 2.1517 (inf) loss_scale 1024.0000 (1840.0812) mem 22339MB +[2024-07-25 07:25:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][600/625] eta 0:00:14 lr 0.000546 wd 0.0500 time 0.5738 (0.5873) data time 0.0008 (0.0016) model time 0.5730 (0.5830) loss 6.3851 (7.4474) grad_norm 2.8336 (inf) loss_scale 1024.0000 (1826.5025) mem 22339MB +[2024-07-25 07:26:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][610/625] eta 0:00:08 lr 0.000546 wd 0.0500 time 0.5678 (0.5871) data time 0.0006 (0.0016) model time 0.5672 (0.5829) loss 8.6888 (7.4542) grad_norm 1.8961 (inf) loss_scale 1024.0000 (1813.3682) mem 22339MB +[2024-07-25 07:26:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [168/300][620/625] eta 0:00:02 lr 0.000546 wd 0.0500 time 0.5694 (0.5869) data time 0.0005 (0.0015) model time 0.5689 (0.5827) loss 7.3826 (7.4562) grad_norm 2.3308 (inf) loss_scale 1024.0000 (1800.6570) mem 22339MB +[2024-07-25 07:26:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 168 training takes 0:06:06 +[2024-07-25 07:26:11 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 07:26:13 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 07:26:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.477 (0.477) Loss 0.5010 (0.5010) Acc@1 89.453 (89.453) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 07:26:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.7739 (0.6266) Acc@1 82.520 (86.683) Acc@5 96.436 (97.807) Mem 22339MB +[2024-07-25 07:26:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9009 (0.7332) Acc@1 76.807 (83.529) Acc@5 95.410 (96.761) Mem 22339MB +[2024-07-25 07:26:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.163 Acc@5 96.743 +[2024-07-25 07:26:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.2% +[2024-07-25 07:26:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 83.16% +[2024-07-25 07:26:16 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 07:26:18 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 07:26:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.485 (0.485) Loss 0.4946 (0.4946) Acc@1 89.941 (89.941) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 07:26:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.158) Loss 0.7627 (0.6230) Acc@1 82.324 (86.892) Acc@5 96.436 (97.856) Mem 22339MB +[2024-07-25 07:26:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.8867 (0.7219) Acc@1 78.564 (83.866) Acc@5 95.752 (96.875) Mem 22339MB +[2024-07-25 07:26:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.459 Acc@5 96.877 +[2024-07-25 07:26:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.5% +[2024-07-25 07:26:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][0/625] eta 0:14:38 lr 0.000546 wd 0.0500 time 1.4060 (1.4060) data time 0.4818 (0.4818) model time 0.0000 (0.0000) loss 6.6614 (6.6614) grad_norm 1.9263 (1.9263) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:26:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][10/625] eta 0:06:39 lr 0.000546 wd 0.0500 time 0.5737 (0.6491) data time 0.0007 (0.0467) model time 0.0000 (0.0000) loss 9.2026 (7.6477) grad_norm 2.4213 (2.3000) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:26:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][20/625] eta 0:06:11 lr 0.000546 wd 0.0500 time 0.5744 (0.6135) data time 0.0007 (0.0249) model time 0.0000 (0.0000) loss 8.6030 (7.8456) grad_norm 2.1022 (3.4571) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:26:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][30/625] eta 0:05:57 lr 0.000546 wd 0.0500 time 0.5740 (0.6007) data time 0.0007 (0.0171) model time 0.0000 (0.0000) loss 6.7200 (7.6956) grad_norm 2.5790 (3.2522) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:26:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][40/625] eta 0:05:50 lr 0.000546 wd 0.0500 time 0.5746 (0.5992) data time 0.0007 (0.0132) model time 0.0000 (0.0000) loss 6.6355 (7.5859) grad_norm 2.2291 (3.1201) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:26:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][50/625] eta 0:05:47 lr 0.000546 wd 0.0500 time 0.7246 (0.6049) data time 0.0006 (0.0107) model time 0.0000 (0.0000) loss 7.3825 (7.5569) grad_norm 3.5270 (3.1423) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:26:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][60/625] eta 0:05:44 lr 0.000545 wd 0.0500 time 0.5734 (0.6089) data time 0.0006 (0.0091) model time 0.5728 (0.6282) loss 7.1330 (7.5074) grad_norm 4.6889 (3.2393) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:27:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][70/625] eta 0:05:42 lr 0.000545 wd 0.0500 time 0.5754 (0.6173) data time 0.0008 (0.0079) model time 0.5746 (0.6481) loss 8.0206 (7.4976) grad_norm 2.4197 (3.1813) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:27:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][80/625] eta 0:05:35 lr 0.000545 wd 0.0500 time 0.6280 (0.6153) data time 0.0006 (0.0071) model time 0.6274 (0.6322) loss 7.4272 (7.5016) grad_norm 3.0999 (3.3775) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:27:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][90/625] eta 0:05:26 lr 0.000545 wd 0.0500 time 0.5720 (0.6107) data time 0.0008 (0.0064) model time 0.5712 (0.6172) loss 6.0699 (7.4736) grad_norm 2.4848 (3.3463) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:27:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][100/625] eta 0:05:18 lr 0.000545 wd 0.0500 time 0.5723 (0.6071) data time 0.0008 (0.0058) model time 0.5715 (0.6084) loss 8.5770 (7.4700) grad_norm 2.2096 (3.2653) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:27:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][110/625] eta 0:05:11 lr 0.000545 wd 0.0500 time 0.5720 (0.6041) data time 0.0008 (0.0054) model time 0.5712 (0.6026) loss 9.1260 (7.4696) grad_norm 2.4147 (3.1883) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:27:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][120/625] eta 0:05:03 lr 0.000545 wd 0.0500 time 0.5757 (0.6017) data time 0.0006 (0.0050) model time 0.5752 (0.5985) loss 6.3805 (7.4382) grad_norm 3.1505 (3.1268) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:27:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][130/625] eta 0:04:56 lr 0.000545 wd 0.0500 time 0.5757 (0.5996) data time 0.0008 (0.0047) model time 0.5749 (0.5954) loss 7.9211 (7.4563) grad_norm 1.6090 (3.0788) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:27:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][140/625] eta 0:04:50 lr 0.000545 wd 0.0500 time 0.5731 (0.5983) data time 0.0009 (0.0044) model time 0.5722 (0.5937) loss 8.7505 (7.4935) grad_norm 2.5597 (3.0177) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:27:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][150/625] eta 0:04:43 lr 0.000545 wd 0.0500 time 0.5760 (0.5967) data time 0.0006 (0.0042) model time 0.5754 (0.5917) loss 5.4114 (7.5029) grad_norm 1.9754 (2.9569) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:27:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][160/625] eta 0:04:36 lr 0.000544 wd 0.0500 time 0.5779 (0.5953) data time 0.0009 (0.0039) model time 0.5770 (0.5901) loss 7.2105 (7.4747) grad_norm 2.9641 (2.9285) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:28:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][170/625] eta 0:04:30 lr 0.000544 wd 0.0500 time 0.5777 (0.5941) data time 0.0006 (0.0038) model time 0.5771 (0.5888) loss 8.5332 (7.4703) grad_norm 3.8803 (2.9123) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:28:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][180/625] eta 0:04:23 lr 0.000544 wd 0.0500 time 0.5888 (0.5932) data time 0.0008 (0.0036) model time 0.5880 (0.5878) loss 6.7034 (7.4692) grad_norm 3.3491 (2.9151) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:28:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][190/625] eta 0:04:17 lr 0.000544 wd 0.0500 time 0.5772 (0.5923) data time 0.0008 (0.0035) model time 0.5765 (0.5869) loss 7.5993 (7.4506) grad_norm 3.0579 (2.9190) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:28:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][200/625] eta 0:04:11 lr 0.000544 wd 0.0500 time 0.5770 (0.5915) data time 0.0008 (0.0033) model time 0.5762 (0.5861) loss 8.6172 (7.4480) grad_norm 1.9974 (2.9211) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:28:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][210/625] eta 0:04:05 lr 0.000544 wd 0.0500 time 0.5767 (0.5908) data time 0.0007 (0.0032) model time 0.5760 (0.5855) loss 7.0500 (7.4380) grad_norm 2.2787 (2.8834) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:28:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][220/625] eta 0:03:58 lr 0.000544 wd 0.0500 time 0.5750 (0.5901) data time 0.0007 (0.0031) model time 0.5742 (0.5848) loss 8.3770 (7.4415) grad_norm 1.8344 (2.8520) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:28:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][230/625] eta 0:03:52 lr 0.000544 wd 0.0500 time 0.5744 (0.5894) data time 0.0006 (0.0030) model time 0.5738 (0.5843) loss 8.3343 (7.4320) grad_norm 1.9823 (2.8156) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:28:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][240/625] eta 0:03:46 lr 0.000544 wd 0.0500 time 0.5758 (0.5889) data time 0.0008 (0.0029) model time 0.5750 (0.5837) loss 7.4876 (7.4368) grad_norm 1.7427 (2.7954) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:28:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][250/625] eta 0:03:40 lr 0.000543 wd 0.0500 time 0.5763 (0.5883) data time 0.0006 (0.0028) model time 0.5757 (0.5833) loss 6.5449 (7.4264) grad_norm 1.4669 (2.7660) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:28:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][260/625] eta 0:03:34 lr 0.000543 wd 0.0500 time 0.5751 (0.5883) data time 0.0008 (0.0027) model time 0.5743 (0.5835) loss 7.3971 (7.4288) grad_norm 2.4087 (2.7536) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:29:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][270/625] eta 0:03:29 lr 0.000543 wd 0.0500 time 0.7343 (0.5905) data time 0.0006 (0.0027) model time 0.7337 (0.5863) loss 8.7671 (7.4310) grad_norm 2.5515 (2.7469) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:29:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][280/625] eta 0:03:23 lr 0.000543 wd 0.0500 time 0.5770 (0.5913) data time 0.0006 (0.0026) model time 0.5764 (0.5874) loss 6.6389 (7.4257) grad_norm 2.3490 (2.7417) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:29:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][290/625] eta 0:03:18 lr 0.000543 wd 0.0500 time 0.5809 (0.5931) data time 0.0008 (0.0025) model time 0.5801 (0.5898) loss 6.7643 (7.4139) grad_norm 5.1751 (2.7322) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:29:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][300/625] eta 0:03:12 lr 0.000543 wd 0.0500 time 0.5748 (0.5934) data time 0.0007 (0.0025) model time 0.5742 (0.5902) loss 7.2046 (7.4102) grad_norm 2.2596 (2.7240) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:29:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][310/625] eta 0:03:06 lr 0.000543 wd 0.0500 time 0.5744 (0.5929) data time 0.0006 (0.0024) model time 0.5738 (0.5897) loss 7.4120 (7.4090) grad_norm 2.3492 (2.7092) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:29:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][320/625] eta 0:03:00 lr 0.000543 wd 0.0500 time 0.5745 (0.5923) data time 0.0008 (0.0024) model time 0.5737 (0.5891) loss 8.1286 (7.4107) grad_norm 4.2537 (2.7246) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:29:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][330/625] eta 0:02:54 lr 0.000543 wd 0.0500 time 0.5756 (0.5918) data time 0.0006 (0.0023) model time 0.5750 (0.5886) loss 6.5937 (7.4014) grad_norm 2.6740 (2.7303) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:29:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][340/625] eta 0:02:48 lr 0.000543 wd 0.0500 time 0.5782 (0.5913) data time 0.0007 (0.0023) model time 0.5775 (0.5881) loss 6.5959 (7.3947) grad_norm 3.8740 (2.7325) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:29:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][350/625] eta 0:02:42 lr 0.000542 wd 0.0500 time 0.5776 (0.5908) data time 0.0006 (0.0022) model time 0.5769 (0.5876) loss 7.5472 (7.4008) grad_norm 1.6937 (2.7116) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:29:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][360/625] eta 0:02:36 lr 0.000542 wd 0.0500 time 0.5750 (0.5904) data time 0.0008 (0.0022) model time 0.5742 (0.5873) loss 8.6115 (7.4060) grad_norm 2.5272 (2.6866) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:30:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][370/625] eta 0:02:30 lr 0.000542 wd 0.0500 time 0.5812 (0.5900) data time 0.0007 (0.0022) model time 0.5804 (0.5869) loss 7.6985 (7.4084) grad_norm 2.1497 (2.6920) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:30:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][380/625] eta 0:02:24 lr 0.000542 wd 0.0500 time 0.5746 (0.5896) data time 0.0006 (0.0021) model time 0.5740 (0.5865) loss 7.5589 (7.4073) grad_norm 2.0631 (2.6810) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:30:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][390/625] eta 0:02:18 lr 0.000542 wd 0.0500 time 0.5765 (0.5893) data time 0.0006 (0.0021) model time 0.5759 (0.5862) loss 8.6350 (7.4056) grad_norm 2.1670 (2.6664) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:30:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][400/625] eta 0:02:12 lr 0.000542 wd 0.0500 time 0.5708 (0.5889) data time 0.0006 (0.0021) model time 0.5702 (0.5857) loss 7.0072 (7.4063) grad_norm 2.0022 (2.6581) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:30:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][410/625] eta 0:02:06 lr 0.000542 wd 0.0500 time 0.5828 (0.5885) data time 0.0006 (0.0020) model time 0.5822 (0.5854) loss 6.2681 (7.4139) grad_norm 2.3503 (2.6477) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:30:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][420/625] eta 0:02:00 lr 0.000542 wd 0.0500 time 0.5740 (0.5882) data time 0.0006 (0.0020) model time 0.5734 (0.5852) loss 7.8916 (7.4072) grad_norm 2.2228 (2.6401) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:30:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][430/625] eta 0:01:54 lr 0.000542 wd 0.0500 time 0.5778 (0.5879) data time 0.0008 (0.0020) model time 0.5770 (0.5849) loss 8.5030 (7.4025) grad_norm 1.9867 (2.6371) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:30:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][440/625] eta 0:01:48 lr 0.000541 wd 0.0500 time 0.5863 (0.5877) data time 0.0008 (0.0019) model time 0.5855 (0.5847) loss 7.9735 (7.4119) grad_norm 2.7383 (2.6336) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:30:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][450/625] eta 0:01:42 lr 0.000541 wd 0.0500 time 0.5742 (0.5874) data time 0.0008 (0.0019) model time 0.5734 (0.5844) loss 6.7738 (7.4162) grad_norm 1.9885 (2.6278) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:30:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][460/625] eta 0:01:36 lr 0.000541 wd 0.0500 time 0.5731 (0.5871) data time 0.0008 (0.0019) model time 0.5723 (0.5841) loss 7.4366 (7.4100) grad_norm 2.9626 (2.6318) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:30:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][470/625] eta 0:01:30 lr 0.000541 wd 0.0500 time 0.5713 (0.5869) data time 0.0006 (0.0019) model time 0.5707 (0.5839) loss 6.8626 (7.4122) grad_norm 3.1451 (2.6298) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:31:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][480/625] eta 0:01:25 lr 0.000541 wd 0.0500 time 0.5778 (0.5870) data time 0.0008 (0.0018) model time 0.5770 (0.5841) loss 6.6535 (7.4093) grad_norm 2.7471 (2.6312) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:31:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][490/625] eta 0:01:19 lr 0.000541 wd 0.0500 time 0.7277 (0.5883) data time 0.0007 (0.0018) model time 0.7270 (0.5856) loss 7.8060 (7.4186) grad_norm 2.1540 (2.6341) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:31:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][500/625] eta 0:01:13 lr 0.000541 wd 0.0500 time 0.7320 (0.5888) data time 0.0008 (0.0018) model time 0.7312 (0.5862) loss 6.3543 (7.4229) grad_norm 2.5653 (2.6392) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:31:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][510/625] eta 0:01:07 lr 0.000541 wd 0.0500 time 0.7776 (0.5901) data time 0.0008 (0.0018) model time 0.7769 (0.5877) loss 7.9209 (7.4178) grad_norm 2.2805 (2.6314) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:31:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][520/625] eta 0:01:01 lr 0.000541 wd 0.0500 time 0.5767 (0.5899) data time 0.0008 (0.0018) model time 0.5759 (0.5875) loss 7.9159 (7.4205) grad_norm 1.6743 (2.6295) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:31:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][530/625] eta 0:00:56 lr 0.000540 wd 0.0500 time 0.5687 (0.5897) data time 0.0006 (0.0017) model time 0.5681 (0.5872) loss 6.6015 (7.4299) grad_norm 1.5860 (2.6189) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:31:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][540/625] eta 0:00:50 lr 0.000540 wd 0.0500 time 0.5755 (0.5894) data time 0.0007 (0.0017) model time 0.5748 (0.5870) loss 7.4992 (7.4303) grad_norm 1.8008 (2.6161) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:31:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][550/625] eta 0:00:44 lr 0.000540 wd 0.0500 time 0.5743 (0.5891) data time 0.0006 (0.0017) model time 0.5736 (0.5867) loss 7.2196 (7.4333) grad_norm 2.2299 (2.6192) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:31:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][560/625] eta 0:00:38 lr 0.000540 wd 0.0500 time 0.5776 (0.5889) data time 0.0008 (0.0017) model time 0.5768 (0.5865) loss 6.4994 (7.4342) grad_norm 1.9036 (2.6145) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:31:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][570/625] eta 0:00:32 lr 0.000540 wd 0.0500 time 0.5754 (0.5886) data time 0.0006 (0.0017) model time 0.5747 (0.5862) loss 7.2689 (7.4424) grad_norm 2.8271 (2.6146) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:32:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][580/625] eta 0:00:26 lr 0.000540 wd 0.0500 time 0.5747 (0.5886) data time 0.0007 (0.0017) model time 0.5740 (0.5862) loss 7.7573 (7.4401) grad_norm 2.1964 (2.6125) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:32:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][590/625] eta 0:00:20 lr 0.000540 wd 0.0500 time 0.5749 (0.5883) data time 0.0008 (0.0017) model time 0.5741 (0.5860) loss 6.5612 (7.4472) grad_norm 2.2143 (2.6108) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:32:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][600/625] eta 0:00:14 lr 0.000540 wd 0.0500 time 0.5778 (0.5881) data time 0.0006 (0.0016) model time 0.5772 (0.5857) loss 8.1029 (7.4410) grad_norm 1.8270 (2.6017) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:32:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][610/625] eta 0:00:08 lr 0.000540 wd 0.0500 time 0.5739 (0.5879) data time 0.0005 (0.0016) model time 0.5733 (0.5855) loss 9.1022 (7.4442) grad_norm 2.8506 (2.5927) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:32:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [169/300][620/625] eta 0:00:02 lr 0.000540 wd 0.0500 time 0.5848 (0.5877) data time 0.0003 (0.0016) model time 0.5844 (0.5853) loss 6.8704 (7.4433) grad_norm 2.1943 (2.5844) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:32:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 169 training takes 0:06:07 +[2024-07-25 07:32:28 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 07:32:30 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 07:32:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.503 (0.503) Loss 0.5215 (0.5215) Acc@1 88.965 (88.965) Acc@5 98.730 (98.730) Mem 22339MB +[2024-07-25 07:32:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.160) Loss 0.7891 (0.6377) Acc@1 81.055 (86.266) Acc@5 96.191 (97.847) Mem 22339MB +[2024-07-25 07:32:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.143) Loss 0.8926 (0.7442) Acc@1 78.027 (83.254) Acc@5 95.459 (96.735) Mem 22339MB +[2024-07-25 07:32:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 82.937 Acc@5 96.703 +[2024-07-25 07:32:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 82.9% +[2024-07-25 07:32:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.814 (0.814) Loss 0.4954 (0.4954) Acc@1 89.893 (89.893) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 07:32:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.188) Loss 0.7622 (0.6228) Acc@1 82.471 (86.879) Acc@5 96.436 (97.852) Mem 22339MB +[2024-07-25 07:32:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.159) Loss 0.8862 (0.7215) Acc@1 78.467 (83.870) Acc@5 95.752 (96.880) Mem 22339MB +[2024-07-25 07:32:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.463 Acc@5 96.887 +[2024-07-25 07:32:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.5% +[2024-07-25 07:32:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][0/625] eta 0:16:05 lr 0.000539 wd 0.0500 time 1.5452 (1.5452) data time 0.9693 (0.9693) model time 0.0000 (0.0000) loss 6.0167 (6.0167) grad_norm 2.9246 (2.9246) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:32:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][10/625] eta 0:06:48 lr 0.000539 wd 0.0500 time 0.5681 (0.6640) data time 0.0007 (0.0888) model time 0.0000 (0.0000) loss 6.5933 (7.0129) grad_norm 2.1978 (2.5046) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:32:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][20/625] eta 0:06:15 lr 0.000539 wd 0.0500 time 0.5672 (0.6206) data time 0.0008 (0.0469) model time 0.0000 (0.0000) loss 5.8696 (7.0665) grad_norm 2.9028 (2.4755) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:32:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][30/625] eta 0:06:00 lr 0.000539 wd 0.0500 time 0.5720 (0.6060) data time 0.0008 (0.0320) model time 0.0000 (0.0000) loss 5.6134 (7.1614) grad_norm 2.5732 (2.6516) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:33:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][40/625] eta 0:05:49 lr 0.000539 wd 0.0500 time 0.5750 (0.5982) data time 0.0007 (0.0244) model time 0.0000 (0.0000) loss 6.0966 (7.1145) grad_norm 3.0500 (2.6595) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:33:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][50/625] eta 0:05:41 lr 0.000539 wd 0.0500 time 0.5769 (0.5937) data time 0.0008 (0.0198) model time 0.0000 (0.0000) loss 7.6314 (7.2162) grad_norm 2.0048 (2.5271) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:33:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][60/625] eta 0:05:33 lr 0.000539 wd 0.0500 time 0.5740 (0.5907) data time 0.0007 (0.0167) model time 0.5732 (0.5742) loss 8.1367 (7.2638) grad_norm 3.0488 (2.4643) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:33:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][70/625] eta 0:05:26 lr 0.000539 wd 0.0500 time 0.5734 (0.5885) data time 0.0008 (0.0144) model time 0.5726 (0.5745) loss 7.1888 (7.3380) grad_norm 2.1674 (2.5444) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:33:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][80/625] eta 0:05:22 lr 0.000539 wd 0.0500 time 0.7729 (0.5914) data time 0.0006 (0.0128) model time 0.7722 (0.5866) loss 7.8867 (7.3955) grad_norm 1.8735 (2.5651) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:33:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][90/625] eta 0:05:19 lr 0.000539 wd 0.0500 time 0.7224 (0.5979) data time 0.0008 (0.0114) model time 0.7216 (0.6025) loss 7.2440 (7.4021) grad_norm 1.7419 (2.5835) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:33:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][100/625] eta 0:05:15 lr 0.000538 wd 0.0500 time 0.5735 (0.6001) data time 0.0008 (0.0104) model time 0.5727 (0.6057) loss 5.9591 (7.3949) grad_norm 1.5977 (2.5540) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:33:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][110/625] eta 0:05:11 lr 0.000538 wd 0.0500 time 0.5727 (0.6042) data time 0.0007 (0.0095) model time 0.5720 (0.6123) loss 7.9485 (7.3574) grad_norm 1.3644 (2.4874) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:33:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][120/625] eta 0:05:04 lr 0.000538 wd 0.0500 time 0.7182 (0.6032) data time 0.0008 (0.0088) model time 0.7175 (0.6094) loss 8.4463 (7.4043) grad_norm 2.4277 (2.4598) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:33:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][130/625] eta 0:04:58 lr 0.000538 wd 0.0500 time 0.5766 (0.6022) data time 0.0006 (0.0082) model time 0.5760 (0.6067) loss 8.5033 (7.3576) grad_norm 2.3545 (2.4123) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:34:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][140/625] eta 0:04:51 lr 0.000538 wd 0.0500 time 0.5748 (0.6001) data time 0.0008 (0.0077) model time 0.5740 (0.6030) loss 8.5316 (7.3532) grad_norm 1.7849 (2.3875) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:34:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][150/625] eta 0:04:44 lr 0.000538 wd 0.0500 time 0.5791 (0.5985) data time 0.0008 (0.0072) model time 0.5784 (0.6001) loss 6.2956 (7.3497) grad_norm 1.7329 (2.3632) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:34:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][160/625] eta 0:04:37 lr 0.000538 wd 0.0500 time 0.5779 (0.5970) data time 0.0006 (0.0068) model time 0.5772 (0.5978) loss 8.0862 (7.3356) grad_norm 1.8705 (2.3562) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:34:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][170/625] eta 0:04:31 lr 0.000538 wd 0.0500 time 0.5769 (0.5957) data time 0.0006 (0.0065) model time 0.5763 (0.5957) loss 6.1451 (7.3319) grad_norm 1.7279 (2.3335) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:34:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][180/625] eta 0:04:24 lr 0.000538 wd 0.0500 time 0.5752 (0.5947) data time 0.0007 (0.0061) model time 0.5744 (0.5943) loss 8.0220 (7.3422) grad_norm 1.6129 (2.3141) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:34:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][190/625] eta 0:04:18 lr 0.000537 wd 0.0500 time 0.5738 (0.5936) data time 0.0007 (0.0059) model time 0.5731 (0.5928) loss 7.4037 (7.3527) grad_norm 1.9593 (2.3069) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:34:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][200/625] eta 0:04:11 lr 0.000537 wd 0.0500 time 0.5736 (0.5926) data time 0.0006 (0.0056) model time 0.5730 (0.5915) loss 7.1001 (7.3496) grad_norm 2.0342 (2.2970) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:34:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][210/625] eta 0:04:05 lr 0.000537 wd 0.0500 time 0.5792 (0.5919) data time 0.0008 (0.0054) model time 0.5784 (0.5905) loss 7.4944 (7.3474) grad_norm 1.9549 (2.3255) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:34:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][220/625] eta 0:03:59 lr 0.000537 wd 0.0500 time 0.5755 (0.5911) data time 0.0009 (0.0052) model time 0.5747 (0.5895) loss 8.0063 (7.3476) grad_norm 2.1678 (2.3042) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:34:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][230/625] eta 0:03:53 lr 0.000537 wd 0.0500 time 0.5781 (0.5904) data time 0.0008 (0.0050) model time 0.5773 (0.5887) loss 7.6753 (7.3421) grad_norm 2.4830 (2.2913) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:34:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][240/625] eta 0:03:47 lr 0.000537 wd 0.0500 time 0.5734 (0.5897) data time 0.0008 (0.0048) model time 0.5725 (0.5878) loss 7.4915 (7.3484) grad_norm 1.9435 (2.2733) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:35:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][250/625] eta 0:03:40 lr 0.000537 wd 0.0500 time 0.5887 (0.5891) data time 0.0008 (0.0047) model time 0.5879 (0.5871) loss 6.7557 (7.3595) grad_norm 2.5601 (2.3352) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:35:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][260/625] eta 0:03:34 lr 0.000537 wd 0.0500 time 0.5723 (0.5885) data time 0.0008 (0.0045) model time 0.5714 (0.5864) loss 7.2302 (7.3589) grad_norm 2.1742 (2.3310) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:35:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][270/625] eta 0:03:28 lr 0.000537 wd 0.0500 time 0.5768 (0.5880) data time 0.0008 (0.0044) model time 0.5760 (0.5858) loss 7.6888 (7.3607) grad_norm 1.9910 (2.3460) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:35:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][280/625] eta 0:03:22 lr 0.000537 wd 0.0500 time 0.5720 (0.5875) data time 0.0007 (0.0042) model time 0.5714 (0.5853) loss 7.2575 (7.3654) grad_norm 2.7721 (2.3763) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:35:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][290/625] eta 0:03:16 lr 0.000536 wd 0.0500 time 0.5739 (0.5870) data time 0.0008 (0.0041) model time 0.5731 (0.5848) loss 7.6713 (7.3807) grad_norm 1.8669 (2.3920) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:35:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][300/625] eta 0:03:10 lr 0.000536 wd 0.0500 time 0.5746 (0.5872) data time 0.0006 (0.0040) model time 0.5740 (0.5850) loss 7.2223 (7.3785) grad_norm 1.7866 (2.3868) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:35:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][310/625] eta 0:03:05 lr 0.000536 wd 0.0500 time 0.5711 (0.5885) data time 0.0007 (0.0039) model time 0.5704 (0.5866) loss 6.4185 (7.3817) grad_norm 2.6165 (2.3834) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:35:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][320/625] eta 0:02:59 lr 0.000536 wd 0.0500 time 0.5707 (0.5894) data time 0.0008 (0.0038) model time 0.5699 (0.5877) loss 7.2138 (7.3781) grad_norm 1.7071 (2.4012) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:35:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][330/625] eta 0:02:54 lr 0.000536 wd 0.0500 time 0.5754 (0.5906) data time 0.0006 (0.0037) model time 0.5748 (0.5892) loss 7.6940 (7.3858) grad_norm 1.7415 (2.4133) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:35:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][340/625] eta 0:02:48 lr 0.000536 wd 0.0500 time 0.6609 (0.5909) data time 0.0008 (0.0036) model time 0.6601 (0.5895) loss 8.0624 (7.3855) grad_norm 1.7032 (2.4155) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:36:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][350/625] eta 0:02:42 lr 0.000536 wd 0.0500 time 0.5724 (0.5908) data time 0.0007 (0.0036) model time 0.5718 (0.5895) loss 7.5286 (7.3920) grad_norm 1.9323 (2.4054) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:36:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][360/625] eta 0:02:36 lr 0.000536 wd 0.0500 time 0.5771 (0.5903) data time 0.0008 (0.0035) model time 0.5763 (0.5890) loss 6.6419 (7.3695) grad_norm 3.1943 (2.4304) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:36:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][370/625] eta 0:02:30 lr 0.000536 wd 0.0500 time 0.5738 (0.5899) data time 0.0008 (0.0034) model time 0.5730 (0.5885) loss 8.7569 (7.3810) grad_norm 2.0403 (2.4480) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:36:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][380/625] eta 0:02:24 lr 0.000535 wd 0.0500 time 0.5784 (0.5895) data time 0.0008 (0.0033) model time 0.5777 (0.5880) loss 9.7428 (7.3888) grad_norm 3.2551 (2.4504) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:36:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][390/625] eta 0:02:18 lr 0.000535 wd 0.0500 time 0.5811 (0.5890) data time 0.0006 (0.0033) model time 0.5805 (0.5875) loss 6.9089 (7.3833) grad_norm 1.7194 (2.4444) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:36:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][400/625] eta 0:02:12 lr 0.000535 wd 0.0500 time 0.5728 (0.5886) data time 0.0007 (0.0032) model time 0.5721 (0.5871) loss 8.1471 (7.3947) grad_norm 2.0593 (2.4317) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:36:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][410/625] eta 0:02:06 lr 0.000535 wd 0.0500 time 0.5748 (0.5883) data time 0.0008 (0.0032) model time 0.5740 (0.5867) loss 7.8502 (7.4150) grad_norm 1.6704 (2.4230) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:36:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][420/625] eta 0:02:00 lr 0.000535 wd 0.0500 time 0.5746 (0.5880) data time 0.0006 (0.0031) model time 0.5740 (0.5863) loss 7.6155 (7.4225) grad_norm 1.7333 (2.4151) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:36:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][430/625] eta 0:01:54 lr 0.000535 wd 0.0500 time 0.5784 (0.5877) data time 0.0006 (0.0031) model time 0.5778 (0.5860) loss 7.5773 (7.4280) grad_norm 3.0899 (2.4203) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:36:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][440/625] eta 0:01:48 lr 0.000535 wd 0.0500 time 0.5792 (0.5874) data time 0.0007 (0.0030) model time 0.5785 (0.5858) loss 9.2587 (7.4440) grad_norm 2.3415 (2.4158) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:37:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][450/625] eta 0:01:42 lr 0.000535 wd 0.0500 time 0.5743 (0.5871) data time 0.0006 (0.0030) model time 0.5737 (0.5855) loss 7.7975 (7.4445) grad_norm 2.3781 (2.4193) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:37:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][460/625] eta 0:01:36 lr 0.000535 wd 0.0500 time 0.5725 (0.5868) data time 0.0008 (0.0029) model time 0.5717 (0.5852) loss 8.4149 (7.4525) grad_norm 2.4492 (2.4186) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:37:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][470/625] eta 0:01:30 lr 0.000535 wd 0.0500 time 0.5789 (0.5866) data time 0.0006 (0.0029) model time 0.5782 (0.5849) loss 8.1692 (7.4617) grad_norm 3.3864 (2.4167) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:37:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][480/625] eta 0:01:25 lr 0.000534 wd 0.0500 time 0.5746 (0.5864) data time 0.0006 (0.0028) model time 0.5740 (0.5847) loss 6.3908 (7.4626) grad_norm 2.1691 (2.4130) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:37:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][490/625] eta 0:01:19 lr 0.000534 wd 0.0500 time 0.5748 (0.5861) data time 0.0006 (0.0028) model time 0.5741 (0.5844) loss 7.0537 (7.4475) grad_norm 3.1686 (2.4230) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:37:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][500/625] eta 0:01:13 lr 0.000534 wd 0.0500 time 0.5765 (0.5859) data time 0.0006 (0.0027) model time 0.5758 (0.5842) loss 6.7947 (7.4537) grad_norm 2.8028 (2.4175) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:37:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][510/625] eta 0:01:07 lr 0.000534 wd 0.0500 time 0.7329 (0.5860) data time 0.0007 (0.0027) model time 0.7321 (0.5844) loss 5.8184 (7.4465) grad_norm 1.7546 (2.4126) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:37:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][520/625] eta 0:01:01 lr 0.000534 wd 0.0500 time 0.5759 (0.5859) data time 0.0006 (0.0027) model time 0.5753 (0.5842) loss 7.1577 (7.4452) grad_norm 2.4583 (2.4076) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:37:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][530/625] eta 0:00:55 lr 0.000534 wd 0.0500 time 0.5681 (0.5874) data time 0.0008 (0.0026) model time 0.5673 (0.5860) loss 7.8286 (7.4449) grad_norm 1.7725 (2.4030) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:37:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][540/625] eta 0:00:49 lr 0.000534 wd 0.0500 time 0.7352 (0.5878) data time 0.0006 (0.0026) model time 0.7347 (0.5864) loss 7.9588 (7.4502) grad_norm 1.7956 (2.3968) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:38:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][550/625] eta 0:00:44 lr 0.000534 wd 0.0500 time 0.5773 (0.5881) data time 0.0008 (0.0026) model time 0.5765 (0.5868) loss 6.7297 (7.4530) grad_norm 2.6882 (2.3954) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:38:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][560/625] eta 0:00:38 lr 0.000534 wd 0.0500 time 0.5774 (0.5883) data time 0.0006 (0.0025) model time 0.5768 (0.5869) loss 7.6531 (7.4566) grad_norm 2.0735 (2.4099) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:38:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][570/625] eta 0:00:32 lr 0.000533 wd 0.0500 time 0.5742 (0.5883) data time 0.0008 (0.0025) model time 0.5734 (0.5870) loss 7.8529 (7.4620) grad_norm 3.7154 (2.4155) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:38:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][580/625] eta 0:00:26 lr 0.000533 wd 0.0500 time 0.5735 (0.5880) data time 0.0007 (0.0025) model time 0.5728 (0.5867) loss 6.1923 (7.4657) grad_norm 1.8228 (2.4126) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:38:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][590/625] eta 0:00:20 lr 0.000533 wd 0.0500 time 0.5744 (0.5878) data time 0.0008 (0.0024) model time 0.5736 (0.5864) loss 8.4623 (7.4746) grad_norm 2.5212 (2.4106) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:38:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][600/625] eta 0:00:14 lr 0.000533 wd 0.0500 time 0.5862 (0.5876) data time 0.0008 (0.0024) model time 0.5854 (0.5863) loss 8.8307 (7.4714) grad_norm 1.9217 (2.4081) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:38:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][610/625] eta 0:00:08 lr 0.000533 wd 0.0500 time 0.5705 (0.5874) data time 0.0006 (0.0024) model time 0.5699 (0.5860) loss 9.0568 (7.4717) grad_norm 1.7573 (2.4149) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:38:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [170/300][620/625] eta 0:00:02 lr 0.000533 wd 0.0500 time 0.5740 (0.5872) data time 0.0006 (0.0024) model time 0.5735 (0.5858) loss 8.3470 (7.4664) grad_norm 2.4471 (2.4215) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:38:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 170 training takes 0:06:06 +[2024-07-25 07:38:44 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 07:38:46 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 07:38:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.500 (0.500) Loss 0.5161 (0.5161) Acc@1 89.844 (89.844) Acc@5 98.730 (98.730) Mem 22339MB +[2024-07-25 07:38:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.160) Loss 0.7979 (0.6408) Acc@1 80.957 (86.515) Acc@5 96.436 (97.736) Mem 22339MB +[2024-07-25 07:38:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.8994 (0.7445) Acc@1 78.027 (83.443) Acc@5 95.312 (96.766) Mem 22339MB +[2024-07-25 07:38:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.119 Acc@5 96.733 +[2024-07-25 07:38:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.1% +[2024-07-25 07:38:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.993 (0.993) Loss 0.4961 (0.4961) Acc@1 89.990 (89.990) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 07:38:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.204) Loss 0.7622 (0.6229) Acc@1 82.471 (86.879) Acc@5 96.484 (97.856) Mem 22339MB +[2024-07-25 07:38:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.167) Loss 0.8862 (0.7215) Acc@1 78.516 (83.877) Acc@5 95.801 (96.880) Mem 22339MB +[2024-07-25 07:38:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.467 Acc@5 96.885 +[2024-07-25 07:38:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.5% +[2024-07-25 07:38:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][0/625] eta 0:15:53 lr 0.000533 wd 0.0500 time 1.5260 (1.5260) data time 0.5603 (0.5603) model time 0.0000 (0.0000) loss 6.4230 (6.4230) grad_norm 3.3649 (3.3649) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:39:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][10/625] eta 0:06:46 lr 0.000533 wd 0.0500 time 0.5742 (0.6605) data time 0.0007 (0.0517) model time 0.0000 (0.0000) loss 6.7352 (6.9493) grad_norm 2.2075 (2.6410) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:39:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][20/625] eta 0:06:15 lr 0.000533 wd 0.0500 time 0.5731 (0.6213) data time 0.0006 (0.0274) model time 0.0000 (0.0000) loss 5.4636 (7.2942) grad_norm 1.7070 (2.6534) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:39:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][30/625] eta 0:06:00 lr 0.000533 wd 0.0500 time 0.5700 (0.6065) data time 0.0008 (0.0188) model time 0.0000 (0.0000) loss 5.9282 (7.1317) grad_norm 2.2969 (2.7273) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:39:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][40/625] eta 0:05:50 lr 0.000532 wd 0.0500 time 0.5760 (0.5988) data time 0.0006 (0.0144) model time 0.0000 (0.0000) loss 6.2119 (7.2904) grad_norm 2.0678 (2.5976) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:39:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][50/625] eta 0:05:41 lr 0.000532 wd 0.0500 time 0.5737 (0.5940) data time 0.0008 (0.0118) model time 0.0000 (0.0000) loss 6.4066 (7.3435) grad_norm 1.6194 (2.4925) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:39:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][60/625] eta 0:05:33 lr 0.000532 wd 0.0500 time 0.5750 (0.5910) data time 0.0007 (0.0100) model time 0.5744 (0.5751) loss 7.2136 (7.3832) grad_norm 1.8869 (2.4190) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:39:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][70/625] eta 0:05:26 lr 0.000532 wd 0.0500 time 0.5723 (0.5886) data time 0.0006 (0.0087) model time 0.5717 (0.5740) loss 9.1978 (7.4841) grad_norm 1.8644 (2.4146) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:39:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][80/625] eta 0:05:19 lr 0.000532 wd 0.0500 time 0.5677 (0.5869) data time 0.0006 (0.0077) model time 0.5671 (0.5739) loss 7.0157 (7.4660) grad_norm 1.6577 (2.3823) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:39:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][90/625] eta 0:05:13 lr 0.000532 wd 0.0500 time 0.5734 (0.5853) data time 0.0006 (0.0069) model time 0.5728 (0.5735) loss 8.0948 (7.4785) grad_norm 1.7720 (2.3218) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:39:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][100/625] eta 0:05:06 lr 0.000532 wd 0.0500 time 0.5749 (0.5842) data time 0.0008 (0.0063) model time 0.5741 (0.5734) loss 7.2693 (7.4991) grad_norm 2.0236 (2.3178) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:39:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][110/625] eta 0:05:00 lr 0.000532 wd 0.0500 time 0.5693 (0.5842) data time 0.0007 (0.0058) model time 0.5686 (0.5750) loss 7.3359 (7.5103) grad_norm 2.6760 (2.3571) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:40:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][120/625] eta 0:04:56 lr 0.000532 wd 0.0500 time 0.7600 (0.5876) data time 0.0006 (0.0054) model time 0.7594 (0.5822) loss 5.7761 (7.5201) grad_norm 3.4818 (2.3940) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:40:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][130/625] eta 0:04:52 lr 0.000531 wd 0.0500 time 0.7586 (0.5912) data time 0.0008 (0.0051) model time 0.7578 (0.5886) loss 6.7196 (7.5003) grad_norm 1.6974 (2.4100) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:40:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][140/625] eta 0:04:48 lr 0.000531 wd 0.0500 time 0.7086 (0.5948) data time 0.0007 (0.0048) model time 0.7080 (0.5944) loss 7.6416 (7.4705) grad_norm 1.7972 (2.4132) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:40:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][150/625] eta 0:04:43 lr 0.000531 wd 0.0500 time 0.5737 (0.5967) data time 0.0007 (0.0045) model time 0.5730 (0.5973) loss 7.0782 (7.4521) grad_norm 1.6259 (2.3775) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:40:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][160/625] eta 0:04:37 lr 0.000531 wd 0.0500 time 0.5724 (0.5966) data time 0.0008 (0.0043) model time 0.5716 (0.5970) loss 7.8230 (7.4226) grad_norm 2.1891 (2.3588) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:40:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][170/625] eta 0:04:30 lr 0.000531 wd 0.0500 time 0.5744 (0.5953) data time 0.0008 (0.0041) model time 0.5736 (0.5951) loss 7.6848 (7.4259) grad_norm 3.0987 (2.3621) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:40:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][180/625] eta 0:04:24 lr 0.000531 wd 0.0500 time 0.5796 (0.5942) data time 0.0008 (0.0039) model time 0.5788 (0.5934) loss 8.3904 (7.4217) grad_norm 1.7487 (2.3396) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:40:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][190/625] eta 0:04:18 lr 0.000531 wd 0.0500 time 0.5780 (0.5932) data time 0.0006 (0.0037) model time 0.5773 (0.5921) loss 6.4381 (7.4254) grad_norm 1.9872 (2.3108) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:40:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][200/625] eta 0:04:11 lr 0.000531 wd 0.0500 time 0.5717 (0.5922) data time 0.0006 (0.0036) model time 0.5711 (0.5907) loss 8.3909 (7.4185) grad_norm 1.6997 (2.3526) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:40:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][210/625] eta 0:04:05 lr 0.000531 wd 0.0500 time 0.5706 (0.5913) data time 0.0008 (0.0035) model time 0.5698 (0.5896) loss 8.3933 (7.4350) grad_norm 5.4399 (2.3722) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:41:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][220/625] eta 0:03:59 lr 0.000531 wd 0.0500 time 0.5748 (0.5905) data time 0.0008 (0.0033) model time 0.5740 (0.5886) loss 6.4276 (7.4345) grad_norm 2.2219 (2.3822) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:41:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][230/625] eta 0:03:52 lr 0.000530 wd 0.0500 time 0.5794 (0.5899) data time 0.0006 (0.0032) model time 0.5787 (0.5879) loss 5.4779 (7.4471) grad_norm 2.7948 (2.4014) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:41:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][240/625] eta 0:03:46 lr 0.000530 wd 0.0500 time 0.5832 (0.5893) data time 0.0009 (0.0031) model time 0.5823 (0.5872) loss 8.8684 (7.4555) grad_norm 3.9596 (2.4204) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:41:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][250/625] eta 0:03:40 lr 0.000530 wd 0.0500 time 0.5791 (0.5887) data time 0.0009 (0.0030) model time 0.5782 (0.5866) loss 9.2041 (7.4772) grad_norm 3.2701 (2.4741) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:41:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][260/625] eta 0:03:34 lr 0.000530 wd 0.0500 time 0.5775 (0.5882) data time 0.0006 (0.0029) model time 0.5768 (0.5860) loss 6.5327 (7.4807) grad_norm 2.8387 (2.4930) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:41:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][270/625] eta 0:03:28 lr 0.000530 wd 0.0500 time 0.5829 (0.5877) data time 0.0006 (0.0029) model time 0.5822 (0.5855) loss 7.7532 (7.4852) grad_norm 2.4791 (2.4863) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:41:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][280/625] eta 0:03:22 lr 0.000530 wd 0.0500 time 0.5748 (0.5873) data time 0.0007 (0.0028) model time 0.5742 (0.5850) loss 8.6730 (7.4953) grad_norm 2.1730 (2.4786) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:41:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][290/625] eta 0:03:16 lr 0.000530 wd 0.0500 time 0.5738 (0.5868) data time 0.0008 (0.0027) model time 0.5730 (0.5844) loss 8.9972 (7.5056) grad_norm 1.9324 (2.4656) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:41:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][300/625] eta 0:03:10 lr 0.000530 wd 0.0500 time 0.5757 (0.5864) data time 0.0008 (0.0027) model time 0.5749 (0.5840) loss 7.9587 (7.5166) grad_norm 1.9027 (2.4533) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:41:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][310/625] eta 0:03:04 lr 0.000530 wd 0.0500 time 0.5818 (0.5860) data time 0.0008 (0.0026) model time 0.5810 (0.5837) loss 8.4270 (7.5168) grad_norm 2.4745 (2.4390) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:42:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][320/625] eta 0:02:58 lr 0.000529 wd 0.0500 time 0.6075 (0.5858) data time 0.0008 (0.0026) model time 0.6068 (0.5834) loss 7.9692 (7.5182) grad_norm 2.4840 (2.4597) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:42:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][330/625] eta 0:02:52 lr 0.000529 wd 0.0500 time 0.5766 (0.5859) data time 0.0006 (0.0025) model time 0.5760 (0.5836) loss 7.6386 (7.5138) grad_norm 2.4647 (2.4622) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:42:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][340/625] eta 0:02:47 lr 0.000529 wd 0.0500 time 0.7231 (0.5872) data time 0.0006 (0.0025) model time 0.7225 (0.5852) loss 7.1765 (7.5190) grad_norm 1.7653 (2.4584) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:42:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][350/625] eta 0:02:41 lr 0.000529 wd 0.0500 time 0.7181 (0.5885) data time 0.0007 (0.0024) model time 0.7174 (0.5868) loss 7.3144 (7.5120) grad_norm 2.1720 (2.4557) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:42:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][360/625] eta 0:02:36 lr 0.000529 wd 0.0500 time 0.5771 (0.5896) data time 0.0006 (0.0024) model time 0.5764 (0.5881) loss 7.5085 (7.5004) grad_norm 1.9275 (2.4484) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:42:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][370/625] eta 0:02:30 lr 0.000529 wd 0.0500 time 0.5745 (0.5904) data time 0.0008 (0.0023) model time 0.5737 (0.5890) loss 7.4788 (7.4886) grad_norm 1.9158 (2.4399) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:42:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][380/625] eta 0:02:24 lr 0.000529 wd 0.0500 time 0.5752 (0.5903) data time 0.0006 (0.0023) model time 0.5746 (0.5889) loss 7.3172 (7.4882) grad_norm 2.6848 (2.4449) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:42:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][390/625] eta 0:02:18 lr 0.000529 wd 0.0500 time 0.5762 (0.5898) data time 0.0007 (0.0022) model time 0.5755 (0.5884) loss 8.5420 (7.4816) grad_norm 1.8042 (2.4447) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:42:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][400/625] eta 0:02:12 lr 0.000529 wd 0.0500 time 0.5731 (0.5895) data time 0.0008 (0.0022) model time 0.5723 (0.5880) loss 7.0644 (7.4836) grad_norm 2.9767 (2.4488) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:42:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][410/625] eta 0:02:06 lr 0.000529 wd 0.0500 time 0.5713 (0.5890) data time 0.0009 (0.0022) model time 0.5704 (0.5875) loss 8.5638 (7.4930) grad_norm 1.4310 (2.4400) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 07:43:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][420/625] eta 0:02:00 lr 0.000528 wd 0.0500 time 0.5757 (0.5887) data time 0.0008 (0.0021) model time 0.5750 (0.5871) loss 7.2294 (7.4980) grad_norm 2.7009 (inf) loss_scale 512.0000 (1020.3515) mem 22339MB +[2024-07-25 07:43:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][430/625] eta 0:01:54 lr 0.000528 wd 0.0500 time 0.5792 (0.5883) data time 0.0008 (0.0021) model time 0.5784 (0.5868) loss 7.5236 (7.4953) grad_norm 16.1568 (inf) loss_scale 512.0000 (1008.5568) mem 22339MB +[2024-07-25 07:43:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][440/625] eta 0:01:48 lr 0.000528 wd 0.0500 time 0.5761 (0.5880) data time 0.0008 (0.0021) model time 0.5753 (0.5864) loss 6.9871 (7.4910) grad_norm 1.8956 (inf) loss_scale 512.0000 (997.2971) mem 22339MB +[2024-07-25 07:43:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][450/625] eta 0:01:42 lr 0.000528 wd 0.0500 time 0.5747 (0.5877) data time 0.0006 (0.0021) model time 0.5741 (0.5861) loss 8.1294 (7.4906) grad_norm 1.6816 (inf) loss_scale 512.0000 (986.5366) mem 22339MB +[2024-07-25 07:43:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][460/625] eta 0:01:36 lr 0.000528 wd 0.0500 time 0.5787 (0.5875) data time 0.0006 (0.0020) model time 0.5781 (0.5859) loss 8.2631 (7.4883) grad_norm 2.2226 (inf) loss_scale 512.0000 (976.2430) mem 22339MB +[2024-07-25 07:43:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][470/625] eta 0:01:31 lr 0.000528 wd 0.0500 time 0.5731 (0.5872) data time 0.0007 (0.0020) model time 0.5724 (0.5856) loss 6.9305 (7.4900) grad_norm 1.8266 (inf) loss_scale 512.0000 (966.3864) mem 22339MB +[2024-07-25 07:43:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][480/625] eta 0:01:25 lr 0.000528 wd 0.0500 time 0.5746 (0.5869) data time 0.0008 (0.0020) model time 0.5738 (0.5852) loss 7.2786 (7.4867) grad_norm 3.1619 (inf) loss_scale 512.0000 (956.9397) mem 22339MB +[2024-07-25 07:43:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][490/625] eta 0:01:19 lr 0.000528 wd 0.0500 time 0.5724 (0.5866) data time 0.0006 (0.0020) model time 0.5718 (0.5850) loss 6.5967 (7.4876) grad_norm 1.6015 (inf) loss_scale 512.0000 (947.8778) mem 22339MB +[2024-07-25 07:43:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][500/625] eta 0:01:13 lr 0.000528 wd 0.0500 time 0.5799 (0.5864) data time 0.0006 (0.0019) model time 0.5794 (0.5847) loss 6.5248 (7.4841) grad_norm 2.3066 (inf) loss_scale 512.0000 (939.1776) mem 22339MB +[2024-07-25 07:43:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][510/625] eta 0:01:07 lr 0.000527 wd 0.0500 time 0.5745 (0.5862) data time 0.0006 (0.0019) model time 0.5739 (0.5845) loss 5.8989 (7.4726) grad_norm 2.6374 (inf) loss_scale 512.0000 (930.8180) mem 22339MB +[2024-07-25 07:43:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][520/625] eta 0:01:01 lr 0.000527 wd 0.0500 time 0.5850 (0.5860) data time 0.0008 (0.0019) model time 0.5842 (0.5843) loss 8.7025 (7.4773) grad_norm 3.3004 (inf) loss_scale 512.0000 (922.7793) mem 22339MB +[2024-07-25 07:44:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][530/625] eta 0:00:55 lr 0.000527 wd 0.0500 time 0.5762 (0.5858) data time 0.0006 (0.0019) model time 0.5756 (0.5841) loss 6.7433 (7.4744) grad_norm 2.2484 (inf) loss_scale 512.0000 (915.0433) mem 22339MB +[2024-07-25 07:44:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][540/625] eta 0:00:49 lr 0.000527 wd 0.0500 time 0.5793 (0.5856) data time 0.0008 (0.0018) model time 0.5786 (0.5839) loss 9.8166 (7.4857) grad_norm 2.0250 (inf) loss_scale 512.0000 (907.5933) mem 22339MB +[2024-07-25 07:44:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][550/625] eta 0:00:43 lr 0.000527 wd 0.0500 time 0.5796 (0.5855) data time 0.0006 (0.0018) model time 0.5790 (0.5838) loss 6.1689 (7.4745) grad_norm 2.9990 (inf) loss_scale 512.0000 (900.4138) mem 22339MB +[2024-07-25 07:44:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][560/625] eta 0:00:38 lr 0.000527 wd 0.0500 time 0.7771 (0.5864) data time 0.0006 (0.0018) model time 0.7764 (0.5848) loss 8.7256 (7.4845) grad_norm 3.5294 (inf) loss_scale 512.0000 (893.4902) mem 22339MB +[2024-07-25 07:44:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][570/625] eta 0:00:32 lr 0.000527 wd 0.0500 time 0.6881 (0.5873) data time 0.0007 (0.0018) model time 0.6873 (0.5858) loss 8.0963 (7.4828) grad_norm 1.9585 (inf) loss_scale 512.0000 (886.8091) mem 22339MB +[2024-07-25 07:44:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][580/625] eta 0:00:26 lr 0.000527 wd 0.0500 time 0.6989 (0.5884) data time 0.0008 (0.0018) model time 0.6981 (0.5870) loss 8.0526 (7.4740) grad_norm 2.2014 (inf) loss_scale 512.0000 (880.3580) mem 22339MB +[2024-07-25 07:44:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][590/625] eta 0:00:20 lr 0.000527 wd 0.0500 time 0.5670 (0.5888) data time 0.0009 (0.0018) model time 0.5660 (0.5875) loss 9.1216 (7.4676) grad_norm 3.2666 (inf) loss_scale 512.0000 (874.1252) mem 22339MB +[2024-07-25 07:44:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][600/625] eta 0:00:14 lr 0.000527 wd 0.0500 time 0.5782 (0.5888) data time 0.0006 (0.0017) model time 0.5775 (0.5875) loss 6.2514 (7.4612) grad_norm 3.2618 (inf) loss_scale 512.0000 (868.0998) mem 22339MB +[2024-07-25 07:44:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][610/625] eta 0:00:08 lr 0.000526 wd 0.0500 time 0.5731 (0.5886) data time 0.0006 (0.0017) model time 0.5725 (0.5873) loss 7.4430 (7.4544) grad_norm 2.2130 (inf) loss_scale 512.0000 (862.2717) mem 22339MB +[2024-07-25 07:44:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [171/300][620/625] eta 0:00:02 lr 0.000526 wd 0.0500 time 0.5778 (0.5884) data time 0.0004 (0.0017) model time 0.5774 (0.5871) loss 6.3739 (7.4466) grad_norm 2.1184 (inf) loss_scale 512.0000 (856.6312) mem 22339MB +[2024-07-25 07:45:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 171 training takes 0:06:07 +[2024-07-25 07:45:01 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 07:45:02 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 07:45:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.493 (0.493) Loss 0.5254 (0.5254) Acc@1 89.209 (89.209) Acc@5 98.828 (98.828) Mem 22339MB +[2024-07-25 07:45:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.161) Loss 0.8066 (0.6443) Acc@1 81.348 (86.448) Acc@5 96.191 (97.732) Mem 22339MB +[2024-07-25 07:45:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.145) Loss 0.9189 (0.7474) Acc@1 77.783 (83.422) Acc@5 95.410 (96.735) Mem 22339MB +[2024-07-25 07:45:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.091 Acc@5 96.721 +[2024-07-25 07:45:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.1% +[2024-07-25 07:45:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 2.585 (2.585) Loss 0.4968 (0.4968) Acc@1 89.893 (89.893) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 07:45:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.349) Loss 0.7617 (0.6227) Acc@1 82.422 (86.905) Acc@5 96.484 (97.865) Mem 22339MB +[2024-07-25 07:45:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.242) Loss 0.8862 (0.7212) Acc@1 78.467 (83.896) Acc@5 95.898 (96.898) Mem 22339MB +[2024-07-25 07:45:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.497 Acc@5 96.901 +[2024-07-25 07:45:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.5% +[2024-07-25 07:45:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.50% +[2024-07-25 07:45:11 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 07:45:13 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 07:45:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][0/625] eta 0:26:59 lr 0.000526 wd 0.0500 time 2.5917 (2.5917) data time 2.0736 (2.0736) model time 0.0000 (0.0000) loss 7.7279 (7.7279) grad_norm 2.2591 (2.2591) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:45:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][10/625] eta 0:07:45 lr 0.000526 wd 0.0500 time 0.5817 (0.7567) data time 0.0007 (0.1892) model time 0.0000 (0.0000) loss 7.6655 (7.4849) grad_norm 1.5991 (2.0771) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:45:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][20/625] eta 0:06:45 lr 0.000526 wd 0.0500 time 0.5749 (0.6695) data time 0.0007 (0.0995) model time 0.0000 (0.0000) loss 6.4089 (7.4407) grad_norm 2.2357 (2.0859) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:45:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][30/625] eta 0:06:19 lr 0.000526 wd 0.0500 time 0.5723 (0.6374) data time 0.0007 (0.0676) model time 0.0000 (0.0000) loss 7.3086 (7.5198) grad_norm 5.1955 (2.4308) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:45:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][40/625] eta 0:06:04 lr 0.000526 wd 0.0500 time 0.5775 (0.6228) data time 0.0008 (0.0513) model time 0.0000 (0.0000) loss 7.7712 (7.4332) grad_norm 2.2347 (2.4953) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:45:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][50/625] eta 0:05:53 lr 0.000526 wd 0.0500 time 0.5795 (0.6147) data time 0.0006 (0.0414) model time 0.0000 (0.0000) loss 6.8573 (7.5091) grad_norm 1.8700 (2.5344) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:45:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][60/625] eta 0:05:44 lr 0.000526 wd 0.0500 time 0.5898 (0.6089) data time 0.0008 (0.0348) model time 0.5890 (0.5782) loss 8.2922 (7.4986) grad_norm 2.1976 (2.6507) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:45:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][70/625] eta 0:05:35 lr 0.000526 wd 0.0500 time 0.5776 (0.6041) data time 0.0006 (0.0300) model time 0.5769 (0.5761) loss 6.8719 (7.3756) grad_norm 3.4592 (2.7793) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:46:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][80/625] eta 0:05:27 lr 0.000525 wd 0.0500 time 0.5729 (0.6005) data time 0.0006 (0.0264) model time 0.5723 (0.5756) loss 6.4788 (7.4156) grad_norm 2.2095 (2.7957) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:46:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][90/625] eta 0:05:20 lr 0.000525 wd 0.0500 time 0.7894 (0.6000) data time 0.0006 (0.0236) model time 0.7887 (0.5803) loss 7.7489 (7.4291) grad_norm 1.6429 (2.7996) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:46:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][100/625] eta 0:05:13 lr 0.000525 wd 0.0500 time 0.5727 (0.5967) data time 0.0008 (0.0213) model time 0.5719 (0.5775) loss 8.5663 (7.4373) grad_norm 2.0465 (2.7093) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:46:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][110/625] eta 0:05:06 lr 0.000525 wd 0.0500 time 0.5731 (0.5947) data time 0.0006 (0.0195) model time 0.5725 (0.5770) loss 7.0803 (7.4398) grad_norm 1.9183 (2.6689) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:46:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][120/625] eta 0:04:59 lr 0.000525 wd 0.0500 time 0.5691 (0.5929) data time 0.0006 (0.0179) model time 0.5684 (0.5762) loss 7.6991 (7.4232) grad_norm 1.6472 (2.6295) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:46:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][130/625] eta 0:04:52 lr 0.000525 wd 0.0500 time 0.5722 (0.5914) data time 0.0009 (0.0166) model time 0.5713 (0.5757) loss 8.3447 (7.4421) grad_norm 2.4288 (2.6347) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:46:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][140/625] eta 0:04:46 lr 0.000525 wd 0.0500 time 0.5736 (0.5900) data time 0.0009 (0.0155) model time 0.5727 (0.5752) loss 8.1732 (7.4439) grad_norm 1.9245 (2.6237) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:46:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][150/625] eta 0:04:40 lr 0.000525 wd 0.0500 time 0.7325 (0.5903) data time 0.0006 (0.0145) model time 0.7319 (0.5771) loss 7.7942 (7.4339) grad_norm 1.7898 (2.6211) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:46:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][160/625] eta 0:04:35 lr 0.000525 wd 0.0500 time 0.6154 (0.5924) data time 0.0006 (0.0137) model time 0.6148 (0.5813) loss 7.6573 (7.4166) grad_norm 3.0083 (2.6105) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:46:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][170/625] eta 0:04:30 lr 0.000524 wd 0.0500 time 0.5833 (0.5944) data time 0.0006 (0.0129) model time 0.5827 (0.5849) loss 7.3579 (7.4324) grad_norm 2.1652 (2.6233) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:47:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][180/625] eta 0:04:25 lr 0.000524 wd 0.0500 time 0.5747 (0.5972) data time 0.0008 (0.0123) model time 0.5739 (0.5895) loss 7.1579 (7.4622) grad_norm 1.7194 (2.6160) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:47:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][190/625] eta 0:04:19 lr 0.000524 wd 0.0500 time 0.5709 (0.5970) data time 0.0006 (0.0117) model time 0.5703 (0.5897) loss 7.1845 (7.4632) grad_norm 1.9508 (2.6005) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:47:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][200/625] eta 0:04:13 lr 0.000524 wd 0.0500 time 0.5737 (0.5962) data time 0.0008 (0.0111) model time 0.5729 (0.5890) loss 8.7009 (7.4778) grad_norm 2.4676 (2.5724) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:47:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][210/625] eta 0:04:06 lr 0.000524 wd 0.0500 time 0.5753 (0.5952) data time 0.0006 (0.0106) model time 0.5746 (0.5881) loss 7.9659 (7.4570) grad_norm 2.3834 (2.5611) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:47:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][220/625] eta 0:04:00 lr 0.000524 wd 0.0500 time 0.5732 (0.5942) data time 0.0008 (0.0102) model time 0.5724 (0.5873) loss 7.2802 (7.4577) grad_norm 2.4062 (2.5510) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:47:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][230/625] eta 0:03:54 lr 0.000524 wd 0.0500 time 0.5730 (0.5933) data time 0.0006 (0.0098) model time 0.5724 (0.5864) loss 6.8717 (7.4415) grad_norm 2.6632 (2.5413) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:47:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][240/625] eta 0:03:48 lr 0.000524 wd 0.0500 time 0.5744 (0.5924) data time 0.0006 (0.0094) model time 0.5738 (0.5856) loss 6.5027 (7.4409) grad_norm 1.9864 (2.5312) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:47:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][250/625] eta 0:03:41 lr 0.000524 wd 0.0500 time 0.5746 (0.5917) data time 0.0006 (0.0091) model time 0.5740 (0.5851) loss 6.9336 (7.4482) grad_norm 2.1420 (2.5142) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:47:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][260/625] eta 0:03:35 lr 0.000524 wd 0.0500 time 0.5741 (0.5911) data time 0.0008 (0.0087) model time 0.5733 (0.5846) loss 8.7649 (7.4464) grad_norm 2.6481 (2.5035) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:47:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][270/625] eta 0:03:29 lr 0.000523 wd 0.0500 time 0.5749 (0.5906) data time 0.0008 (0.0085) model time 0.5741 (0.5841) loss 6.1836 (7.4382) grad_norm 1.9828 (2.4944) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:47:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][280/625] eta 0:03:23 lr 0.000523 wd 0.0500 time 0.5720 (0.5901) data time 0.0006 (0.0082) model time 0.5714 (0.5838) loss 7.9372 (7.4509) grad_norm 2.8395 (2.4843) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:48:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][290/625] eta 0:03:17 lr 0.000523 wd 0.0500 time 0.5732 (0.5896) data time 0.0009 (0.0079) model time 0.5723 (0.5834) loss 6.3596 (7.4522) grad_norm 2.4029 (2.4840) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:48:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][300/625] eta 0:03:11 lr 0.000523 wd 0.0500 time 0.5725 (0.5891) data time 0.0008 (0.0077) model time 0.5717 (0.5830) loss 6.9434 (7.4342) grad_norm 2.8298 (2.4899) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:48:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][310/625] eta 0:03:05 lr 0.000523 wd 0.0500 time 0.5228 (0.5888) data time 0.0007 (0.0075) model time 0.5221 (0.5829) loss 6.3617 (7.4469) grad_norm 2.0626 (2.4778) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:48:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][320/625] eta 0:02:59 lr 0.000523 wd 0.0500 time 0.5929 (0.5884) data time 0.0007 (0.0073) model time 0.5922 (0.5826) loss 6.6586 (7.4308) grad_norm 2.2691 (2.4736) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:48:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][330/625] eta 0:02:53 lr 0.000523 wd 0.0500 time 0.5686 (0.5878) data time 0.0008 (0.0071) model time 0.5678 (0.5821) loss 7.6700 (7.4273) grad_norm 2.2796 (2.4657) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:48:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][340/625] eta 0:02:47 lr 0.000523 wd 0.0500 time 0.5746 (0.5874) data time 0.0006 (0.0069) model time 0.5741 (0.5818) loss 7.4864 (7.4235) grad_norm 2.2486 (2.4575) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:48:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][350/625] eta 0:02:41 lr 0.000523 wd 0.0500 time 0.5730 (0.5870) data time 0.0006 (0.0067) model time 0.5724 (0.5815) loss 7.2562 (7.4290) grad_norm 1.8845 (2.4536) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:48:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][360/625] eta 0:02:35 lr 0.000522 wd 0.0500 time 0.5758 (0.5867) data time 0.0006 (0.0065) model time 0.5752 (0.5813) loss 8.9606 (7.4338) grad_norm 2.1660 (2.4493) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:48:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][370/625] eta 0:02:29 lr 0.000522 wd 0.0500 time 0.5781 (0.5869) data time 0.0008 (0.0064) model time 0.5773 (0.5817) loss 7.8728 (7.4341) grad_norm 2.9529 (2.4460) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:48:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][380/625] eta 0:02:24 lr 0.000522 wd 0.0500 time 0.5752 (0.5882) data time 0.0006 (0.0062) model time 0.5746 (0.5833) loss 7.5069 (7.4237) grad_norm 1.7876 (2.4343) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:49:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][390/625] eta 0:02:18 lr 0.000522 wd 0.0500 time 0.7184 (0.5895) data time 0.0006 (0.0061) model time 0.7178 (0.5850) loss 8.4934 (7.4342) grad_norm 3.0449 (2.4530) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:49:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][400/625] eta 0:02:13 lr 0.000522 wd 0.0500 time 0.5728 (0.5911) data time 0.0008 (0.0060) model time 0.5720 (0.5869) loss 7.8694 (7.4331) grad_norm 2.6669 (2.4625) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:49:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][410/625] eta 0:02:07 lr 0.000522 wd 0.0500 time 0.7240 (0.5916) data time 0.0007 (0.0058) model time 0.7233 (0.5875) loss 5.9184 (7.4160) grad_norm 1.9289 (2.4597) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:49:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][420/625] eta 0:02:01 lr 0.000522 wd 0.0500 time 0.5698 (0.5912) data time 0.0007 (0.0057) model time 0.5690 (0.5872) loss 5.9876 (7.4195) grad_norm 2.0464 (2.4598) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:49:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][430/625] eta 0:01:55 lr 0.000522 wd 0.0500 time 0.5743 (0.5908) data time 0.0008 (0.0056) model time 0.5735 (0.5868) loss 8.3835 (7.4259) grad_norm 1.7093 (2.4577) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:49:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][440/625] eta 0:01:49 lr 0.000522 wd 0.0500 time 0.5812 (0.5904) data time 0.0006 (0.0055) model time 0.5806 (0.5865) loss 6.4181 (7.4273) grad_norm 1.8685 (2.4561) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:49:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][450/625] eta 0:01:43 lr 0.000522 wd 0.0500 time 0.5753 (0.5901) data time 0.0007 (0.0054) model time 0.5746 (0.5861) loss 7.3142 (7.4230) grad_norm 1.5857 (2.4561) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:49:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][460/625] eta 0:01:37 lr 0.000521 wd 0.0500 time 0.5682 (0.5897) data time 0.0008 (0.0053) model time 0.5675 (0.5858) loss 8.2939 (7.4191) grad_norm 2.3707 (2.4489) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:49:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][470/625] eta 0:01:31 lr 0.000521 wd 0.0500 time 0.5708 (0.5893) data time 0.0006 (0.0052) model time 0.5703 (0.5855) loss 6.6320 (7.4126) grad_norm 2.0978 (2.4423) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:49:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][480/625] eta 0:01:25 lr 0.000521 wd 0.0500 time 0.5767 (0.5890) data time 0.0006 (0.0051) model time 0.5761 (0.5852) loss 6.2881 (7.4034) grad_norm 2.5869 (2.4401) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:50:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][490/625] eta 0:01:19 lr 0.000521 wd 0.0500 time 0.5724 (0.5887) data time 0.0007 (0.0050) model time 0.5717 (0.5849) loss 7.7980 (7.4104) grad_norm 4.3635 (2.4432) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:50:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][500/625] eta 0:01:13 lr 0.000521 wd 0.0500 time 0.5737 (0.5885) data time 0.0008 (0.0049) model time 0.5730 (0.5847) loss 8.7535 (7.4127) grad_norm 1.6472 (2.4472) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:50:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][510/625] eta 0:01:07 lr 0.000521 wd 0.0500 time 0.5731 (0.5882) data time 0.0008 (0.0049) model time 0.5723 (0.5844) loss 6.3542 (7.4101) grad_norm 1.9157 (2.4410) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:50:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][520/625] eta 0:01:01 lr 0.000521 wd 0.0500 time 0.5727 (0.5879) data time 0.0006 (0.0048) model time 0.5720 (0.5842) loss 6.8513 (7.4046) grad_norm 1.6387 (2.4356) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:50:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][530/625] eta 0:00:55 lr 0.000521 wd 0.0500 time 0.7081 (0.5879) data time 0.0007 (0.0047) model time 0.7074 (0.5842) loss 7.3317 (7.4052) grad_norm 2.4039 (2.4335) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:50:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][540/625] eta 0:00:49 lr 0.000521 wd 0.0500 time 0.5712 (0.5876) data time 0.0008 (0.0046) model time 0.5704 (0.5840) loss 8.1503 (7.4027) grad_norm 1.8204 (2.4275) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:50:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][550/625] eta 0:00:44 lr 0.000520 wd 0.0500 time 0.5748 (0.5873) data time 0.0007 (0.0046) model time 0.5741 (0.5837) loss 5.4448 (7.3958) grad_norm 1.8788 (2.4198) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:50:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][560/625] eta 0:00:38 lr 0.000520 wd 0.0500 time 0.5825 (0.5872) data time 0.0006 (0.0045) model time 0.5819 (0.5836) loss 6.6092 (7.3958) grad_norm 1.7381 (2.4090) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:50:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][570/625] eta 0:00:32 lr 0.000520 wd 0.0500 time 0.5778 (0.5870) data time 0.0008 (0.0044) model time 0.5771 (0.5835) loss 7.9390 (7.3971) grad_norm 1.7402 (2.4050) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:50:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][580/625] eta 0:00:26 lr 0.000520 wd 0.0500 time 0.5816 (0.5869) data time 0.0008 (0.0044) model time 0.5808 (0.5834) loss 8.8147 (7.3972) grad_norm 2.1606 (2.4077) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:50:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][590/625] eta 0:00:20 lr 0.000520 wd 0.0500 time 0.7383 (0.5871) data time 0.0006 (0.0043) model time 0.7377 (0.5837) loss 6.9922 (7.3950) grad_norm 2.2282 (2.4071) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:51:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][600/625] eta 0:00:14 lr 0.000520 wd 0.0500 time 0.7614 (0.5883) data time 0.0007 (0.0042) model time 0.7608 (0.5851) loss 8.7150 (7.3989) grad_norm 2.1274 (2.4077) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:51:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][610/625] eta 0:00:08 lr 0.000520 wd 0.0500 time 0.7559 (0.5889) data time 0.0006 (0.0042) model time 0.7554 (0.5858) loss 6.9683 (7.3975) grad_norm 3.4981 (2.4085) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:51:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [172/300][620/625] eta 0:00:02 lr 0.000520 wd 0.0500 time 0.5700 (0.5892) data time 0.0006 (0.0041) model time 0.5694 (0.5861) loss 8.6749 (7.3996) grad_norm 2.0341 (2.4073) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:51:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 172 training takes 0:06:08 +[2024-07-25 07:51:21 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 07:51:22 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 07:51:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.488 (0.488) Loss 0.5107 (0.5107) Acc@1 89.600 (89.600) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-25 07:51:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.7617 (0.6242) Acc@1 82.861 (86.705) Acc@5 96.533 (97.834) Mem 22339MB +[2024-07-25 07:51:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.8760 (0.7299) Acc@1 78.223 (83.601) Acc@5 95.703 (96.791) Mem 22339MB +[2024-07-25 07:51:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.333 Acc@5 96.763 +[2024-07-25 07:51:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.3% +[2024-07-25 07:51:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 83.33% +[2024-07-25 07:51:26 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 07:51:27 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 07:51:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.481 (0.481) Loss 0.4968 (0.4968) Acc@1 89.990 (89.990) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-25 07:51:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.158) Loss 0.7617 (0.6226) Acc@1 82.373 (86.919) Acc@5 96.436 (97.865) Mem 22339MB +[2024-07-25 07:51:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.8853 (0.7210) Acc@1 78.516 (83.898) Acc@5 95.898 (96.908) Mem 22339MB +[2024-07-25 07:51:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.503 Acc@5 96.903 +[2024-07-25 07:51:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.5% +[2024-07-25 07:51:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.50% +[2024-07-25 07:51:31 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 07:51:32 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 07:51:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][0/625] eta 0:09:05 lr 0.000520 wd 0.0500 time 0.8731 (0.8731) data time 0.3560 (0.3560) model time 0.0000 (0.0000) loss 6.7917 (6.7917) grad_norm 2.4085 (2.4085) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:51:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][10/625] eta 0:06:24 lr 0.000520 wd 0.0500 time 0.5703 (0.6245) data time 0.0006 (0.0331) model time 0.0000 (0.0000) loss 7.8734 (7.2038) grad_norm 1.6054 (2.4109) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:51:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][20/625] eta 0:06:02 lr 0.000519 wd 0.0500 time 0.5649 (0.5996) data time 0.0006 (0.0177) model time 0.0000 (0.0000) loss 7.1080 (7.4354) grad_norm 3.1136 (2.3789) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:51:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][30/625] eta 0:05:51 lr 0.000519 wd 0.0500 time 0.5691 (0.5910) data time 0.0006 (0.0122) model time 0.0000 (0.0000) loss 7.6834 (7.4101) grad_norm 1.8081 (2.3415) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:51:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][40/625] eta 0:05:45 lr 0.000519 wd 0.0500 time 0.5741 (0.5903) data time 0.0008 (0.0094) model time 0.0000 (0.0000) loss 7.5047 (7.5677) grad_norm 1.8645 (2.2634) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:52:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][50/625] eta 0:05:37 lr 0.000519 wd 0.0500 time 0.5732 (0.5874) data time 0.0009 (0.0078) model time 0.0000 (0.0000) loss 7.6597 (7.5940) grad_norm 2.0487 (2.1858) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:52:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][60/625] eta 0:05:30 lr 0.000519 wd 0.0500 time 0.5652 (0.5850) data time 0.0009 (0.0066) model time 0.5644 (0.5720) loss 8.1054 (7.6766) grad_norm 2.1457 (2.3856) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:52:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][70/625] eta 0:05:23 lr 0.000519 wd 0.0500 time 0.5732 (0.5830) data time 0.0007 (0.0058) model time 0.5725 (0.5707) loss 6.7824 (7.6577) grad_norm 2.6755 (2.4771) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:52:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][80/625] eta 0:05:17 lr 0.000519 wd 0.0500 time 0.5731 (0.5817) data time 0.0006 (0.0052) model time 0.5725 (0.5711) loss 6.3005 (7.5797) grad_norm 1.9471 (2.4564) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:52:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][90/625] eta 0:05:10 lr 0.000519 wd 0.0500 time 0.5674 (0.5808) data time 0.0008 (0.0047) model time 0.5666 (0.5715) loss 6.2506 (7.5754) grad_norm 3.3696 (2.4092) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:52:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][100/625] eta 0:05:04 lr 0.000519 wd 0.0500 time 0.5665 (0.5799) data time 0.0006 (0.0044) model time 0.5659 (0.5714) loss 6.4688 (7.5679) grad_norm 2.2039 (2.3795) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:52:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][110/625] eta 0:04:58 lr 0.000519 wd 0.0500 time 0.5721 (0.5793) data time 0.0006 (0.0040) model time 0.5715 (0.5716) loss 7.0338 (7.5267) grad_norm 2.2735 (2.3465) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:52:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][120/625] eta 0:04:52 lr 0.000518 wd 0.0500 time 0.5676 (0.5789) data time 0.0006 (0.0038) model time 0.5670 (0.5718) loss 7.3709 (7.5095) grad_norm 1.7429 (2.3233) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:52:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][130/625] eta 0:04:46 lr 0.000518 wd 0.0500 time 0.5714 (0.5785) data time 0.0006 (0.0035) model time 0.5707 (0.5720) loss 8.5530 (7.5135) grad_norm 2.2414 (2.3163) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:52:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][140/625] eta 0:04:40 lr 0.000518 wd 0.0500 time 0.5725 (0.5782) data time 0.0008 (0.0033) model time 0.5716 (0.5722) loss 7.3953 (7.5124) grad_norm 2.4387 (2.3100) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:53:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][150/625] eta 0:04:34 lr 0.000518 wd 0.0500 time 0.5736 (0.5780) data time 0.0007 (0.0032) model time 0.5729 (0.5724) loss 7.6707 (7.5340) grad_norm 1.6087 (2.3006) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:53:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][160/625] eta 0:04:28 lr 0.000518 wd 0.0500 time 0.5697 (0.5776) data time 0.0007 (0.0030) model time 0.5690 (0.5722) loss 7.9035 (7.4954) grad_norm 1.9076 (2.2910) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:53:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][170/625] eta 0:04:22 lr 0.000518 wd 0.0500 time 0.5747 (0.5774) data time 0.0006 (0.0029) model time 0.5740 (0.5724) loss 6.9924 (7.4962) grad_norm 5.0975 (2.3153) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:53:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][180/625] eta 0:04:17 lr 0.000518 wd 0.0500 time 0.5714 (0.5780) data time 0.0008 (0.0028) model time 0.5706 (0.5735) loss 6.1032 (7.4928) grad_norm 2.1698 (2.3107) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:53:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][190/625] eta 0:04:11 lr 0.000518 wd 0.0500 time 0.5734 (0.5792) data time 0.0009 (0.0027) model time 0.5725 (0.5754) loss 6.2698 (7.4587) grad_norm 3.1143 (2.3070) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:53:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][200/625] eta 0:04:08 lr 0.000518 wd 0.0500 time 0.5722 (0.5843) data time 0.0008 (0.0026) model time 0.5714 (0.5824) loss 8.0200 (7.4655) grad_norm 2.9755 (2.3232) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:53:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][210/625] eta 0:04:03 lr 0.000517 wd 0.0500 time 0.5717 (0.5872) data time 0.0006 (0.0025) model time 0.5711 (0.5863) loss 8.0777 (7.4683) grad_norm 1.7270 (2.3294) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:53:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][220/625] eta 0:03:58 lr 0.000517 wd 0.0500 time 0.6726 (0.5884) data time 0.0006 (0.0024) model time 0.6720 (0.5879) loss 7.5079 (7.4594) grad_norm 2.3162 (2.3432) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:53:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][230/625] eta 0:03:52 lr 0.000517 wd 0.0500 time 0.5718 (0.5887) data time 0.0008 (0.0024) model time 0.5710 (0.5882) loss 7.9344 (7.4508) grad_norm 3.1395 (2.3610) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:53:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][240/625] eta 0:03:46 lr 0.000517 wd 0.0500 time 0.5714 (0.5880) data time 0.0006 (0.0023) model time 0.5708 (0.5874) loss 7.3571 (7.4446) grad_norm 2.1014 (2.3896) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:54:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][250/625] eta 0:03:40 lr 0.000517 wd 0.0500 time 0.5707 (0.5875) data time 0.0007 (0.0022) model time 0.5700 (0.5867) loss 6.9587 (7.4561) grad_norm 2.6860 (2.4101) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:54:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][260/625] eta 0:03:34 lr 0.000517 wd 0.0500 time 0.5718 (0.5873) data time 0.0008 (0.0022) model time 0.5710 (0.5864) loss 8.9136 (7.4795) grad_norm 3.1915 (2.4142) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:54:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][270/625] eta 0:03:28 lr 0.000517 wd 0.0500 time 0.5715 (0.5868) data time 0.0006 (0.0021) model time 0.5709 (0.5858) loss 7.1762 (7.4888) grad_norm 1.9172 (2.4042) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:54:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][280/625] eta 0:03:22 lr 0.000517 wd 0.0500 time 0.5724 (0.5863) data time 0.0008 (0.0021) model time 0.5716 (0.5852) loss 7.4380 (7.4852) grad_norm 1.8924 (2.4188) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:54:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][290/625] eta 0:03:16 lr 0.000517 wd 0.0500 time 0.5721 (0.5859) data time 0.0006 (0.0020) model time 0.5715 (0.5848) loss 7.4198 (7.4931) grad_norm 2.0555 (2.4202) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:54:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][300/625] eta 0:03:10 lr 0.000517 wd 0.0500 time 0.5682 (0.5855) data time 0.0006 (0.0020) model time 0.5676 (0.5843) loss 6.1381 (7.4864) grad_norm 1.8986 (2.4090) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:54:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][310/625] eta 0:03:04 lr 0.000516 wd 0.0500 time 0.5648 (0.5851) data time 0.0008 (0.0020) model time 0.5640 (0.5838) loss 9.2785 (7.4864) grad_norm 1.9372 (2.4043) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:54:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][320/625] eta 0:02:58 lr 0.000516 wd 0.0500 time 0.5698 (0.5848) data time 0.0006 (0.0019) model time 0.5692 (0.5835) loss 7.8872 (7.4793) grad_norm 2.3152 (2.3955) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:54:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][330/625] eta 0:02:52 lr 0.000516 wd 0.0500 time 0.5733 (0.5845) data time 0.0008 (0.0019) model time 0.5725 (0.5831) loss 7.4911 (7.4675) grad_norm 3.3234 (2.3960) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:54:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][340/625] eta 0:02:46 lr 0.000516 wd 0.0500 time 0.5735 (0.5842) data time 0.0008 (0.0019) model time 0.5727 (0.5828) loss 7.1988 (7.4674) grad_norm 2.1550 (2.3849) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:54:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][350/625] eta 0:02:40 lr 0.000516 wd 0.0500 time 0.5724 (0.5839) data time 0.0006 (0.0018) model time 0.5718 (0.5825) loss 8.9621 (7.4811) grad_norm 1.6681 (2.3659) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:55:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][360/625] eta 0:02:34 lr 0.000516 wd 0.0500 time 0.5724 (0.5837) data time 0.0009 (0.0018) model time 0.5715 (0.5822) loss 5.5485 (7.4790) grad_norm 2.4029 (2.3589) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:55:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][370/625] eta 0:02:28 lr 0.000516 wd 0.0500 time 0.5752 (0.5835) data time 0.0008 (0.0018) model time 0.5744 (0.5820) loss 8.4921 (7.4807) grad_norm 2.5063 (2.3660) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:55:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][380/625] eta 0:02:22 lr 0.000516 wd 0.0500 time 0.5713 (0.5832) data time 0.0006 (0.0017) model time 0.5706 (0.5818) loss 8.1253 (7.4904) grad_norm 2.5171 (2.3603) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:55:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][390/625] eta 0:02:17 lr 0.000516 wd 0.0500 time 0.5633 (0.5830) data time 0.0007 (0.0017) model time 0.5626 (0.5816) loss 7.8067 (7.4750) grad_norm 2.2210 (2.3632) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:55:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][400/625] eta 0:02:11 lr 0.000515 wd 0.0500 time 0.5654 (0.5831) data time 0.0008 (0.0017) model time 0.5646 (0.5816) loss 7.5758 (7.4736) grad_norm 2.4914 (2.3605) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:55:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][410/625] eta 0:02:05 lr 0.000515 wd 0.0500 time 0.7194 (0.5837) data time 0.0007 (0.0017) model time 0.7187 (0.5824) loss 8.4251 (7.4733) grad_norm 2.1061 (2.3595) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:55:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][420/625] eta 0:02:00 lr 0.000515 wd 0.0500 time 0.5719 (0.5854) data time 0.0008 (0.0017) model time 0.5711 (0.5843) loss 7.5617 (7.4733) grad_norm 1.8830 (2.3560) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:55:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][430/625] eta 0:01:54 lr 0.000515 wd 0.0500 time 0.5671 (0.5868) data time 0.0008 (0.0016) model time 0.5663 (0.5859) loss 7.9994 (7.4809) grad_norm 3.3278 (2.3600) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:55:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][440/625] eta 0:01:48 lr 0.000515 wd 0.0500 time 0.7293 (0.5874) data time 0.0006 (0.0016) model time 0.7287 (0.5866) loss 8.2504 (7.4833) grad_norm 2.3031 (2.3547) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:55:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][450/625] eta 0:01:42 lr 0.000515 wd 0.0500 time 0.5689 (0.5872) data time 0.0007 (0.0016) model time 0.5682 (0.5864) loss 7.8572 (7.4833) grad_norm 2.6966 (2.3539) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:56:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][460/625] eta 0:01:36 lr 0.000515 wd 0.0500 time 0.5737 (0.5869) data time 0.0007 (0.0016) model time 0.5730 (0.5861) loss 6.1268 (7.4824) grad_norm 2.7760 (2.3515) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:56:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][470/625] eta 0:01:30 lr 0.000515 wd 0.0500 time 0.5679 (0.5866) data time 0.0008 (0.0016) model time 0.5671 (0.5857) loss 5.9950 (7.4690) grad_norm 2.1490 (2.3501) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:56:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][480/625] eta 0:01:25 lr 0.000515 wd 0.0500 time 0.5673 (0.5865) data time 0.0008 (0.0016) model time 0.5665 (0.5856) loss 7.9493 (7.4625) grad_norm 4.3087 (2.3660) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:56:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][490/625] eta 0:01:19 lr 0.000515 wd 0.0500 time 0.5669 (0.5862) data time 0.0006 (0.0015) model time 0.5664 (0.5853) loss 7.5959 (7.4618) grad_norm 2.2816 (2.3670) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:56:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][500/625] eta 0:01:13 lr 0.000514 wd 0.0500 time 0.5721 (0.5859) data time 0.0008 (0.0015) model time 0.5713 (0.5850) loss 7.1563 (7.4669) grad_norm 1.4298 (2.3604) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:56:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][510/625] eta 0:01:07 lr 0.000514 wd 0.0500 time 0.5695 (0.5857) data time 0.0006 (0.0015) model time 0.5688 (0.5847) loss 7.9124 (7.4714) grad_norm 2.0416 (2.3543) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:56:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][520/625] eta 0:01:01 lr 0.000514 wd 0.0500 time 0.5680 (0.5854) data time 0.0008 (0.0015) model time 0.5672 (0.5843) loss 6.6311 (7.4729) grad_norm 2.3719 (2.3522) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:56:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][530/625] eta 0:00:55 lr 0.000514 wd 0.0500 time 0.5660 (0.5851) data time 0.0006 (0.0015) model time 0.5654 (0.5841) loss 8.6430 (7.4756) grad_norm 3.0172 (2.3639) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:56:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][540/625] eta 0:00:49 lr 0.000514 wd 0.0500 time 0.5725 (0.5849) data time 0.0008 (0.0015) model time 0.5717 (0.5838) loss 7.1985 (7.4835) grad_norm 3.3323 (2.3738) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:56:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][550/625] eta 0:00:43 lr 0.000514 wd 0.0500 time 0.5763 (0.5847) data time 0.0006 (0.0015) model time 0.5757 (0.5836) loss 7.4576 (7.4842) grad_norm 3.9660 (2.3838) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:57:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][560/625] eta 0:00:37 lr 0.000514 wd 0.0500 time 0.5625 (0.5846) data time 0.0008 (0.0015) model time 0.5618 (0.5835) loss 6.9111 (7.4758) grad_norm 3.0671 (2.3917) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:57:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][570/625] eta 0:00:32 lr 0.000514 wd 0.0500 time 0.5765 (0.5844) data time 0.0006 (0.0015) model time 0.5759 (0.5833) loss 6.9757 (7.4793) grad_norm 3.0247 (2.4273) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:57:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][580/625] eta 0:00:26 lr 0.000514 wd 0.0500 time 0.5778 (0.5842) data time 0.0007 (0.0014) model time 0.5771 (0.5831) loss 6.8109 (7.4776) grad_norm 3.5672 (2.4540) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:57:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][590/625] eta 0:00:20 lr 0.000513 wd 0.0500 time 0.5694 (0.5841) data time 0.0008 (0.0014) model time 0.5686 (0.5829) loss 6.9234 (7.4822) grad_norm 2.5155 (2.4603) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:57:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][600/625] eta 0:00:14 lr 0.000513 wd 0.0500 time 0.5678 (0.5839) data time 0.0008 (0.0014) model time 0.5670 (0.5828) loss 7.9030 (7.4872) grad_norm 1.7052 (2.4539) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:57:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][610/625] eta 0:00:08 lr 0.000513 wd 0.0500 time 0.5732 (0.5837) data time 0.0006 (0.0014) model time 0.5726 (0.5826) loss 7.1458 (7.4778) grad_norm 2.5504 (2.4542) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:57:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [173/300][620/625] eta 0:00:02 lr 0.000513 wd 0.0500 time 0.5768 (0.5838) data time 0.0006 (0.0014) model time 0.5762 (0.5826) loss 7.4219 (7.4804) grad_norm 2.5437 (2.4526) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:57:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 173 training takes 0:06:04 +[2024-07-25 07:57:37 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 07:57:39 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 07:57:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.482 (0.482) Loss 0.5044 (0.5044) Acc@1 89.111 (89.111) Acc@5 98.682 (98.682) Mem 22339MB +[2024-07-25 07:57:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.8213 (0.6326) Acc@1 80.811 (86.381) Acc@5 95.557 (97.696) Mem 22339MB +[2024-07-25 07:57:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.143) Loss 0.8945 (0.7349) Acc@1 77.881 (83.389) Acc@5 95.166 (96.652) Mem 22339MB +[2024-07-25 07:57:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.051 Acc@5 96.645 +[2024-07-25 07:57:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.1% +[2024-07-25 07:57:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.835 (0.835) Loss 0.4973 (0.4973) Acc@1 89.844 (89.844) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-25 07:57:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.193) Loss 0.7607 (0.6224) Acc@1 82.471 (86.905) Acc@5 96.484 (97.887) Mem 22339MB +[2024-07-25 07:57:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.161) Loss 0.8838 (0.7205) Acc@1 78.662 (83.896) Acc@5 95.898 (96.917) Mem 22339MB +[2024-07-25 07:57:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.503 Acc@5 96.915 +[2024-07-25 07:57:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.5% +[2024-07-25 07:57:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][0/625] eta 0:17:22 lr 0.000513 wd 0.0500 time 1.6686 (1.6686) data time 0.6496 (0.6496) model time 0.0000 (0.0000) loss 8.6991 (8.6991) grad_norm 2.6409 (2.6409) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:57:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][10/625] eta 0:07:21 lr 0.000513 wd 0.0500 time 0.7622 (0.7171) data time 0.0007 (0.0598) model time 0.0000 (0.0000) loss 7.9090 (7.3863) grad_norm 3.4273 (2.5269) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:58:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][20/625] eta 0:06:54 lr 0.000513 wd 0.0500 time 0.5734 (0.6846) data time 0.0008 (0.0317) model time 0.0000 (0.0000) loss 8.1746 (7.5057) grad_norm 2.6117 (2.4884) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:58:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][30/625] eta 0:06:35 lr 0.000513 wd 0.0500 time 0.5739 (0.6655) data time 0.0008 (0.0217) model time 0.0000 (0.0000) loss 8.3229 (7.5030) grad_norm 2.0428 (2.3670) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:58:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][40/625] eta 0:06:21 lr 0.000513 wd 0.0500 time 0.5218 (0.6524) data time 0.0008 (0.0167) model time 0.0000 (0.0000) loss 8.2134 (7.5110) grad_norm 4.1132 (2.3602) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:58:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][50/625] eta 0:06:06 lr 0.000513 wd 0.0500 time 0.5756 (0.6377) data time 0.0006 (0.0135) model time 0.0000 (0.0000) loss 6.9686 (7.5053) grad_norm 1.5679 (2.3216) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:58:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][60/625] eta 0:05:54 lr 0.000512 wd 0.0500 time 0.5699 (0.6272) data time 0.0007 (0.0115) model time 0.5693 (0.5726) loss 7.5165 (7.5466) grad_norm 1.8643 (2.2785) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:58:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][70/625] eta 0:05:43 lr 0.000512 wd 0.0500 time 0.5720 (0.6194) data time 0.0006 (0.0100) model time 0.5714 (0.5720) loss 8.1190 (7.5732) grad_norm 1.6679 (2.3561) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:58:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][80/625] eta 0:05:34 lr 0.000512 wd 0.0500 time 0.5744 (0.6136) data time 0.0008 (0.0088) model time 0.5737 (0.5718) loss 8.1667 (7.5822) grad_norm 2.7881 (2.3685) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:58:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][90/625] eta 0:05:25 lr 0.000512 wd 0.0500 time 0.5771 (0.6092) data time 0.0008 (0.0079) model time 0.5763 (0.5720) loss 7.0812 (7.5667) grad_norm 2.0601 (2.3190) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:58:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][100/625] eta 0:05:17 lr 0.000512 wd 0.0500 time 0.5745 (0.6057) data time 0.0008 (0.0072) model time 0.5738 (0.5722) loss 6.5015 (7.4890) grad_norm 2.0227 (2.2925) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:58:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][110/625] eta 0:05:10 lr 0.000512 wd 0.0500 time 0.5787 (0.6029) data time 0.0006 (0.0067) model time 0.5781 (0.5725) loss 6.1905 (7.4925) grad_norm 4.5178 (2.3116) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:58:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][120/625] eta 0:05:03 lr 0.000512 wd 0.0500 time 0.5746 (0.6006) data time 0.0008 (0.0062) model time 0.5738 (0.5727) loss 6.9986 (7.4878) grad_norm 1.9358 (2.3259) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:59:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][130/625] eta 0:04:56 lr 0.000512 wd 0.0500 time 0.5843 (0.5987) data time 0.0007 (0.0058) model time 0.5835 (0.5730) loss 7.9358 (7.4769) grad_norm 2.3218 (2.3463) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:59:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][140/625] eta 0:04:49 lr 0.000512 wd 0.0500 time 0.5832 (0.5971) data time 0.0007 (0.0054) model time 0.5825 (0.5733) loss 8.5323 (7.4790) grad_norm 2.1287 (2.3351) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:59:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][150/625] eta 0:04:42 lr 0.000511 wd 0.0500 time 0.5736 (0.5956) data time 0.0008 (0.0051) model time 0.5728 (0.5733) loss 7.7931 (7.4539) grad_norm 2.0608 (2.3501) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:59:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][160/625] eta 0:04:36 lr 0.000511 wd 0.0500 time 0.5776 (0.5943) data time 0.0008 (0.0048) model time 0.5768 (0.5733) loss 6.0669 (7.4359) grad_norm 1.9328 (2.3674) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:59:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][170/625] eta 0:04:29 lr 0.000511 wd 0.0500 time 0.5722 (0.5931) data time 0.0007 (0.0046) model time 0.5716 (0.5734) loss 5.6587 (7.4309) grad_norm 4.3494 (2.3846) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:59:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][180/625] eta 0:04:23 lr 0.000511 wd 0.0500 time 0.5744 (0.5921) data time 0.0008 (0.0044) model time 0.5736 (0.5734) loss 7.2802 (7.4187) grad_norm 2.3844 (2.4118) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:59:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][190/625] eta 0:04:17 lr 0.000511 wd 0.0500 time 0.5751 (0.5912) data time 0.0008 (0.0042) model time 0.5743 (0.5735) loss 8.9694 (7.4100) grad_norm 2.2733 (2.4161) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:59:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][200/625] eta 0:04:10 lr 0.000511 wd 0.0500 time 0.5749 (0.5903) data time 0.0006 (0.0041) model time 0.5743 (0.5733) loss 8.2306 (7.4161) grad_norm 2.2319 (2.4029) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:59:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][210/625] eta 0:04:04 lr 0.000511 wd 0.0500 time 0.5777 (0.5903) data time 0.0007 (0.0039) model time 0.5770 (0.5744) loss 7.8350 (7.4115) grad_norm 2.1798 (2.4156) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 07:59:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][220/625] eta 0:03:59 lr 0.000511 wd 0.0500 time 0.7372 (0.5906) data time 0.0006 (0.0038) model time 0.7366 (0.5755) loss 6.9782 (7.4350) grad_norm 2.5371 (2.4062) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:00:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][230/625] eta 0:03:53 lr 0.000511 wd 0.0500 time 0.7027 (0.5917) data time 0.0007 (0.0037) model time 0.7020 (0.5777) loss 6.5275 (7.4199) grad_norm 2.1537 (2.3964) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:00:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][240/625] eta 0:03:48 lr 0.000511 wd 0.0500 time 0.5735 (0.5944) data time 0.0006 (0.0035) model time 0.5729 (0.5819) loss 8.3269 (7.4203) grad_norm 3.0643 (2.3826) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:00:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][250/625] eta 0:03:43 lr 0.000510 wd 0.0500 time 0.5739 (0.5957) data time 0.0006 (0.0034) model time 0.5733 (0.5842) loss 7.7266 (7.4197) grad_norm 3.2218 (2.3807) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:00:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][260/625] eta 0:03:37 lr 0.000510 wd 0.0500 time 0.6827 (0.5967) data time 0.0008 (0.0033) model time 0.6819 (0.5858) loss 8.3359 (7.4299) grad_norm 2.3548 (2.3967) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:00:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][270/625] eta 0:03:31 lr 0.000510 wd 0.0500 time 0.5895 (0.5960) data time 0.0006 (0.0032) model time 0.5889 (0.5855) loss 8.3263 (7.4280) grad_norm 2.0509 (2.3961) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:00:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][280/625] eta 0:03:25 lr 0.000510 wd 0.0500 time 0.5757 (0.5952) data time 0.0009 (0.0032) model time 0.5748 (0.5850) loss 7.1612 (7.4093) grad_norm 2.1178 (2.4059) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:00:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][290/625] eta 0:03:19 lr 0.000510 wd 0.0500 time 0.5741 (0.5946) data time 0.0006 (0.0031) model time 0.5735 (0.5845) loss 8.1883 (7.4154) grad_norm 2.3141 (2.4351) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:00:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][300/625] eta 0:03:13 lr 0.000510 wd 0.0500 time 0.5733 (0.5939) data time 0.0008 (0.0030) model time 0.5725 (0.5841) loss 7.6671 (7.4171) grad_norm 2.5680 (2.4488) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:00:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][310/625] eta 0:03:06 lr 0.000510 wd 0.0500 time 0.5769 (0.5933) data time 0.0006 (0.0029) model time 0.5763 (0.5837) loss 8.2542 (7.4249) grad_norm 1.6866 (2.4510) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:00:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][320/625] eta 0:03:00 lr 0.000510 wd 0.0500 time 0.5729 (0.5926) data time 0.0010 (0.0029) model time 0.5720 (0.5833) loss 6.3383 (7.4117) grad_norm 1.9821 (2.4431) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:01:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][330/625] eta 0:02:54 lr 0.000510 wd 0.0500 time 0.5712 (0.5921) data time 0.0006 (0.0028) model time 0.5705 (0.5829) loss 6.5262 (7.4021) grad_norm 2.0656 (2.4390) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:01:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][340/625] eta 0:02:48 lr 0.000509 wd 0.0500 time 0.5767 (0.5916) data time 0.0006 (0.0027) model time 0.5761 (0.5826) loss 7.0621 (7.3935) grad_norm 2.9663 (2.4377) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:01:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][350/625] eta 0:02:42 lr 0.000509 wd 0.0500 time 0.5745 (0.5911) data time 0.0008 (0.0027) model time 0.5737 (0.5823) loss 8.9866 (7.3867) grad_norm 1.9507 (2.4393) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:01:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][360/625] eta 0:02:36 lr 0.000509 wd 0.0500 time 0.5751 (0.5906) data time 0.0006 (0.0026) model time 0.5746 (0.5820) loss 6.9017 (7.3921) grad_norm 2.8623 (2.4386) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:01:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][370/625] eta 0:02:30 lr 0.000509 wd 0.0500 time 0.5769 (0.5902) data time 0.0006 (0.0026) model time 0.5763 (0.5818) loss 6.6345 (7.3913) grad_norm 1.8598 (2.4304) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:01:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][380/625] eta 0:02:24 lr 0.000509 wd 0.0500 time 0.5729 (0.5898) data time 0.0007 (0.0025) model time 0.5722 (0.5816) loss 6.6383 (7.3952) grad_norm 1.5903 (2.4224) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:01:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][390/625] eta 0:02:18 lr 0.000509 wd 0.0500 time 0.5730 (0.5895) data time 0.0008 (0.0025) model time 0.5722 (0.5814) loss 6.3761 (7.3920) grad_norm 1.4329 (2.4137) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:01:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][400/625] eta 0:02:12 lr 0.000509 wd 0.0500 time 0.5760 (0.5891) data time 0.0008 (0.0024) model time 0.5752 (0.5812) loss 6.3425 (7.3850) grad_norm 2.4882 (2.4165) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:01:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][410/625] eta 0:02:06 lr 0.000509 wd 0.0500 time 0.5743 (0.5887) data time 0.0010 (0.0024) model time 0.5733 (0.5810) loss 6.5017 (7.3777) grad_norm 2.0258 (2.4241) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:01:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][420/625] eta 0:02:00 lr 0.000509 wd 0.0500 time 0.5808 (0.5884) data time 0.0006 (0.0024) model time 0.5802 (0.5808) loss 8.2081 (7.3794) grad_norm 1.9357 (2.4237) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:01:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][430/625] eta 0:01:54 lr 0.000509 wd 0.0500 time 0.5753 (0.5883) data time 0.0008 (0.0023) model time 0.5745 (0.5808) loss 8.0896 (7.3894) grad_norm 1.8948 (2.4212) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:02:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][440/625] eta 0:01:48 lr 0.000508 wd 0.0500 time 0.5786 (0.5880) data time 0.0006 (0.0023) model time 0.5779 (0.5806) loss 9.2482 (7.3924) grad_norm 4.2632 (2.4299) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:02:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][450/625] eta 0:01:43 lr 0.000508 wd 0.0500 time 0.7654 (0.5888) data time 0.0008 (0.0023) model time 0.7646 (0.5817) loss 6.6837 (7.3896) grad_norm 2.6032 (2.4446) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:02:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][460/625] eta 0:01:37 lr 0.000508 wd 0.0500 time 0.7071 (0.5900) data time 0.0008 (0.0022) model time 0.7062 (0.5832) loss 8.0783 (7.3902) grad_norm 2.8881 (2.4442) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:02:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][470/625] eta 0:01:31 lr 0.000508 wd 0.0500 time 0.7240 (0.5910) data time 0.0006 (0.0022) model time 0.7234 (0.5845) loss 5.7147 (7.3797) grad_norm 2.6289 (2.4409) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:02:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][480/625] eta 0:01:25 lr 0.000508 wd 0.0500 time 0.5686 (0.5917) data time 0.0008 (0.0022) model time 0.5678 (0.5854) loss 7.8640 (7.3832) grad_norm 2.1209 (2.4351) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:02:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][490/625] eta 0:01:19 lr 0.000508 wd 0.0500 time 0.5719 (0.5914) data time 0.0008 (0.0022) model time 0.5711 (0.5852) loss 8.3746 (7.3857) grad_norm 3.6081 (2.4318) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:02:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][500/625] eta 0:01:13 lr 0.000508 wd 0.0500 time 0.5744 (0.5910) data time 0.0008 (0.0021) model time 0.5737 (0.5849) loss 7.3526 (7.3868) grad_norm 1.6799 (2.4320) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:02:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][510/625] eta 0:01:07 lr 0.000508 wd 0.0500 time 0.5743 (0.5907) data time 0.0007 (0.0021) model time 0.5736 (0.5847) loss 7.3844 (7.3861) grad_norm 1.7228 (2.4292) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:02:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][520/625] eta 0:01:01 lr 0.000508 wd 0.0500 time 0.5761 (0.5904) data time 0.0007 (0.0021) model time 0.5754 (0.5844) loss 7.1494 (7.3850) grad_norm 2.0219 (2.4298) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:02:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][530/625] eta 0:00:56 lr 0.000508 wd 0.0500 time 0.5731 (0.5901) data time 0.0006 (0.0021) model time 0.5725 (0.5842) loss 7.9640 (7.3891) grad_norm 2.5708 (2.4352) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:03:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][540/625] eta 0:00:50 lr 0.000507 wd 0.0500 time 0.5736 (0.5898) data time 0.0006 (0.0020) model time 0.5730 (0.5840) loss 6.3289 (7.3904) grad_norm 2.1258 (2.4376) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:03:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][550/625] eta 0:00:44 lr 0.000507 wd 0.0500 time 0.5736 (0.5895) data time 0.0008 (0.0020) model time 0.5728 (0.5838) loss 7.3365 (7.3913) grad_norm 2.1648 (2.4598) loss_scale 1024.0000 (519.4338) mem 22339MB +[2024-07-25 08:03:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][560/625] eta 0:00:38 lr 0.000507 wd 0.0500 time 0.5778 (0.5892) data time 0.0006 (0.0020) model time 0.5771 (0.5836) loss 7.3632 (7.3998) grad_norm 1.7081 (2.4586) loss_scale 1024.0000 (528.4278) mem 22339MB +[2024-07-25 08:03:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][570/625] eta 0:00:32 lr 0.000507 wd 0.0500 time 0.5726 (0.5890) data time 0.0006 (0.0020) model time 0.5720 (0.5834) loss 6.6533 (7.3963) grad_norm 2.4230 (2.4573) loss_scale 1024.0000 (537.1068) mem 22339MB +[2024-07-25 08:03:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][580/625] eta 0:00:26 lr 0.000507 wd 0.0500 time 0.5745 (0.5887) data time 0.0006 (0.0019) model time 0.5739 (0.5832) loss 8.1177 (7.4040) grad_norm 2.0229 (2.4534) loss_scale 1024.0000 (545.4871) mem 22339MB +[2024-07-25 08:03:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][590/625] eta 0:00:20 lr 0.000507 wd 0.0500 time 0.5744 (0.5885) data time 0.0006 (0.0019) model time 0.5738 (0.5830) loss 5.3659 (7.4043) grad_norm 2.4090 (2.4516) loss_scale 1024.0000 (553.5838) mem 22339MB +[2024-07-25 08:03:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][600/625] eta 0:00:14 lr 0.000507 wd 0.0500 time 0.5777 (0.5883) data time 0.0008 (0.0019) model time 0.5769 (0.5828) loss 8.2147 (7.4082) grad_norm 2.0298 (2.4660) loss_scale 1024.0000 (561.4110) mem 22339MB +[2024-07-25 08:03:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][610/625] eta 0:00:08 lr 0.000507 wd 0.0500 time 0.5749 (0.5880) data time 0.0004 (0.0019) model time 0.5745 (0.5827) loss 6.3554 (7.4087) grad_norm 3.5024 (2.4672) loss_scale 1024.0000 (568.9820) mem 22339MB +[2024-07-25 08:03:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [174/300][620/625] eta 0:00:02 lr 0.000507 wd 0.0500 time 0.5791 (0.5878) data time 0.0006 (0.0019) model time 0.5785 (0.5825) loss 8.2435 (7.4163) grad_norm 2.2845 (2.4743) loss_scale 1024.0000 (576.3092) mem 22339MB +[2024-07-25 08:03:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 174 training takes 0:06:07 +[2024-07-25 08:03:53 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 08:03:55 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 08:03:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.474 (0.474) Loss 0.5107 (0.5107) Acc@1 89.453 (89.453) Acc@5 98.682 (98.682) Mem 22339MB +[2024-07-25 08:03:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.157) Loss 0.7935 (0.6352) Acc@1 82.324 (86.599) Acc@5 96.191 (97.763) Mem 22339MB +[2024-07-25 08:03:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9111 (0.7375) Acc@1 78.174 (83.608) Acc@5 95.166 (96.768) Mem 22339MB +[2024-07-25 08:03:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.223 Acc@5 96.731 +[2024-07-25 08:03:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.2% +[2024-07-25 08:03:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.817 (0.817) Loss 0.4978 (0.4978) Acc@1 89.893 (89.893) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-25 08:04:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.188) Loss 0.7607 (0.6226) Acc@1 82.422 (86.932) Acc@5 96.533 (97.878) Mem 22339MB +[2024-07-25 08:04:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.159) Loss 0.8833 (0.7204) Acc@1 78.613 (83.915) Acc@5 95.898 (96.922) Mem 22339MB +[2024-07-25 08:04:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.513 Acc@5 96.921 +[2024-07-25 08:04:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.5% +[2024-07-25 08:04:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.51% +[2024-07-25 08:04:02 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 08:04:04 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 08:04:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][0/625] eta 0:09:42 lr 0.000507 wd 0.0500 time 0.9317 (0.9317) data time 0.4127 (0.4127) model time 0.0000 (0.0000) loss 5.5265 (5.5265) grad_norm 2.6902 (2.6902) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:04:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][10/625] eta 0:06:12 lr 0.000506 wd 0.0500 time 0.5737 (0.6059) data time 0.0006 (0.0382) model time 0.0000 (0.0000) loss 7.2235 (7.1867) grad_norm 2.1468 (2.2235) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:04:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][20/625] eta 0:06:01 lr 0.000506 wd 0.0500 time 0.5730 (0.5973) data time 0.0008 (0.0204) model time 0.0000 (0.0000) loss 7.8616 (7.3205) grad_norm 2.4681 (2.2149) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:04:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][30/625] eta 0:05:52 lr 0.000506 wd 0.0500 time 0.5720 (0.5921) data time 0.0006 (0.0142) model time 0.0000 (0.0000) loss 6.7468 (7.4041) grad_norm 1.9314 (2.4622) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:04:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][40/625] eta 0:05:48 lr 0.000506 wd 0.0500 time 0.7198 (0.5952) data time 0.0006 (0.0109) model time 0.0000 (0.0000) loss 6.7516 (7.4757) grad_norm 1.9736 (2.5012) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:04:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][50/625] eta 0:05:47 lr 0.000506 wd 0.0500 time 0.7478 (0.6037) data time 0.0007 (0.0089) model time 0.0000 (0.0000) loss 5.3863 (7.3569) grad_norm 3.0493 (2.6650) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:04:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][60/625] eta 0:05:40 lr 0.000506 wd 0.0500 time 0.6256 (0.6033) data time 0.0007 (0.0076) model time 0.6249 (0.6002) loss 7.6547 (7.4317) grad_norm 2.3480 (2.9927) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:04:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][70/625] eta 0:05:36 lr 0.000506 wd 0.0500 time 0.5705 (0.6059) data time 0.0006 (0.0067) model time 0.5698 (0.6105) loss 6.7846 (7.3810) grad_norm 2.3397 (3.0501) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:04:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][80/625] eta 0:05:29 lr 0.000506 wd 0.0500 time 0.5736 (0.6048) data time 0.0006 (0.0059) model time 0.5730 (0.6058) loss 8.1723 (7.3609) grad_norm 3.3682 (3.0498) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:04:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][90/625] eta 0:05:22 lr 0.000506 wd 0.0500 time 0.5706 (0.6030) data time 0.0006 (0.0054) model time 0.5700 (0.6011) loss 7.3225 (7.3090) grad_norm 3.7191 (3.0804) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:05:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][100/625] eta 0:05:15 lr 0.000505 wd 0.0500 time 0.5769 (0.6002) data time 0.0010 (0.0049) model time 0.5759 (0.5957) loss 7.7929 (7.3449) grad_norm 2.2458 (3.0108) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:05:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][110/625] eta 0:05:07 lr 0.000505 wd 0.0500 time 0.5694 (0.5978) data time 0.0009 (0.0046) model time 0.5685 (0.5919) loss 8.5762 (7.3493) grad_norm 1.7974 (2.9503) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:05:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][120/625] eta 0:05:00 lr 0.000505 wd 0.0500 time 0.5707 (0.5959) data time 0.0006 (0.0042) model time 0.5701 (0.5893) loss 5.9895 (7.3543) grad_norm 1.5899 (2.8619) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:05:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][130/625] eta 0:04:54 lr 0.000505 wd 0.0500 time 0.5695 (0.5942) data time 0.0007 (0.0040) model time 0.5688 (0.5873) loss 8.9464 (7.3782) grad_norm 2.6895 (2.8721) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:05:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][140/625] eta 0:04:47 lr 0.000505 wd 0.0500 time 0.5675 (0.5927) data time 0.0008 (0.0037) model time 0.5667 (0.5856) loss 6.7480 (7.3443) grad_norm 2.7945 (2.8612) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:05:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][150/625] eta 0:04:40 lr 0.000505 wd 0.0500 time 0.5736 (0.5913) data time 0.0008 (0.0036) model time 0.5727 (0.5842) loss 8.0236 (7.3706) grad_norm 5.3063 (2.8483) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:05:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][160/625] eta 0:04:34 lr 0.000505 wd 0.0500 time 0.5726 (0.5902) data time 0.0008 (0.0034) model time 0.5718 (0.5831) loss 6.5404 (7.3857) grad_norm 1.9502 (2.8399) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:05:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][170/625] eta 0:04:28 lr 0.000505 wd 0.0500 time 0.5643 (0.5893) data time 0.0006 (0.0032) model time 0.5637 (0.5823) loss 6.1930 (7.3907) grad_norm 2.8303 (2.8138) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:05:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][180/625] eta 0:04:21 lr 0.000505 wd 0.0500 time 0.5740 (0.5885) data time 0.0008 (0.0031) model time 0.5732 (0.5818) loss 5.4826 (7.3730) grad_norm 4.3382 (2.8611) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:05:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][190/625] eta 0:04:15 lr 0.000505 wd 0.0500 time 0.5717 (0.5878) data time 0.0006 (0.0030) model time 0.5711 (0.5811) loss 6.8595 (7.3857) grad_norm 3.5418 (2.8474) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:06:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][200/625] eta 0:04:09 lr 0.000504 wd 0.0500 time 0.5651 (0.5871) data time 0.0008 (0.0029) model time 0.5643 (0.5807) loss 8.0520 (7.3875) grad_norm 3.1059 (2.8392) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:06:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][210/625] eta 0:04:03 lr 0.000504 wd 0.0500 time 0.5721 (0.5865) data time 0.0008 (0.0028) model time 0.5713 (0.5802) loss 8.8060 (7.3848) grad_norm 2.2185 (2.8301) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:06:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][220/625] eta 0:03:57 lr 0.000504 wd 0.0500 time 0.5726 (0.5860) data time 0.0008 (0.0027) model time 0.5719 (0.5799) loss 8.3843 (7.3859) grad_norm 1.7705 (2.8023) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:06:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][230/625] eta 0:03:51 lr 0.000504 wd 0.0500 time 0.5719 (0.5855) data time 0.0008 (0.0026) model time 0.5711 (0.5796) loss 8.7878 (7.3896) grad_norm 1.9973 (2.7682) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:06:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][240/625] eta 0:03:45 lr 0.000504 wd 0.0500 time 0.5728 (0.5853) data time 0.0006 (0.0025) model time 0.5722 (0.5795) loss 8.6248 (7.3922) grad_norm 2.3495 (2.7394) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:06:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][250/625] eta 0:03:39 lr 0.000504 wd 0.0500 time 0.5738 (0.5855) data time 0.0009 (0.0025) model time 0.5729 (0.5800) loss 6.2201 (7.3846) grad_norm 2.3380 (2.7377) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:06:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][260/625] eta 0:03:33 lr 0.000504 wd 0.0500 time 0.5644 (0.5854) data time 0.0006 (0.0024) model time 0.5638 (0.5801) loss 5.3769 (7.3673) grad_norm 2.4199 (2.7230) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:06:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][270/625] eta 0:03:28 lr 0.000504 wd 0.0500 time 0.7481 (0.5872) data time 0.0009 (0.0023) model time 0.7472 (0.5826) loss 5.7563 (7.3552) grad_norm 1.9218 (2.7115) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:06:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][280/625] eta 0:03:22 lr 0.000504 wd 0.0500 time 0.5722 (0.5873) data time 0.0006 (0.0023) model time 0.5715 (0.5829) loss 7.4382 (7.3665) grad_norm 1.7270 (2.6863) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:06:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][290/625] eta 0:03:17 lr 0.000503 wd 0.0500 time 0.5734 (0.5881) data time 0.0007 (0.0022) model time 0.5727 (0.5840) loss 9.0948 (7.3898) grad_norm 1.8995 (2.6608) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:07:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][300/625] eta 0:03:11 lr 0.000503 wd 0.0500 time 0.5727 (0.5884) data time 0.0006 (0.0022) model time 0.5721 (0.5845) loss 7.2909 (7.3816) grad_norm 2.3091 (2.6389) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:07:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][310/625] eta 0:03:05 lr 0.000503 wd 0.0500 time 0.5748 (0.5880) data time 0.0006 (0.0021) model time 0.5742 (0.5842) loss 6.6910 (7.3726) grad_norm 2.8025 (2.6239) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:07:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][320/625] eta 0:02:59 lr 0.000503 wd 0.0500 time 0.5706 (0.5877) data time 0.0006 (0.0021) model time 0.5700 (0.5838) loss 7.4015 (7.3816) grad_norm 1.8225 (2.6130) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:07:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][330/625] eta 0:02:53 lr 0.000503 wd 0.0500 time 0.5702 (0.5872) data time 0.0006 (0.0021) model time 0.5696 (0.5834) loss 6.3105 (7.3736) grad_norm 2.8261 (2.5923) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:07:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][340/625] eta 0:02:47 lr 0.000503 wd 0.0500 time 0.5729 (0.5869) data time 0.0006 (0.0020) model time 0.5723 (0.5831) loss 6.9648 (7.3701) grad_norm 2.7907 (2.5795) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:07:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][350/625] eta 0:02:41 lr 0.000503 wd 0.0500 time 0.5715 (0.5865) data time 0.0007 (0.0020) model time 0.5707 (0.5828) loss 6.7383 (7.3643) grad_norm 1.8106 (2.5626) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:07:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][360/625] eta 0:02:35 lr 0.000503 wd 0.0500 time 0.5715 (0.5862) data time 0.0006 (0.0020) model time 0.5709 (0.5825) loss 9.0475 (7.3634) grad_norm 2.6165 (2.5707) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:07:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][370/625] eta 0:02:29 lr 0.000503 wd 0.0500 time 0.5732 (0.5858) data time 0.0008 (0.0019) model time 0.5724 (0.5822) loss 7.0771 (7.3699) grad_norm 3.0409 (2.5674) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:07:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][380/625] eta 0:02:23 lr 0.000503 wd 0.0500 time 0.5719 (0.5855) data time 0.0007 (0.0019) model time 0.5712 (0.5819) loss 8.3849 (7.3684) grad_norm 3.2548 (2.5681) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:07:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][390/625] eta 0:02:17 lr 0.000502 wd 0.0500 time 0.5735 (0.5852) data time 0.0008 (0.0019) model time 0.5728 (0.5816) loss 7.7442 (7.3733) grad_norm 2.6817 (2.5636) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:07:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][400/625] eta 0:02:11 lr 0.000502 wd 0.0500 time 0.5749 (0.5850) data time 0.0006 (0.0018) model time 0.5743 (0.5814) loss 8.4655 (7.3775) grad_norm 2.4479 (2.5603) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:08:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][410/625] eta 0:02:05 lr 0.000502 wd 0.0500 time 0.5725 (0.5847) data time 0.0007 (0.0018) model time 0.5718 (0.5812) loss 6.3199 (7.3786) grad_norm 1.7881 (2.5709) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:08:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][420/625] eta 0:01:59 lr 0.000502 wd 0.0500 time 0.5743 (0.5845) data time 0.0006 (0.0018) model time 0.5737 (0.5810) loss 7.2704 (7.3773) grad_norm 2.8056 (2.5638) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:08:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][430/625] eta 0:01:53 lr 0.000502 wd 0.0500 time 0.5726 (0.5843) data time 0.0007 (0.0018) model time 0.5718 (0.5808) loss 7.8852 (7.3737) grad_norm 2.6631 (2.5611) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:08:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][440/625] eta 0:01:48 lr 0.000502 wd 0.0500 time 0.5747 (0.5841) data time 0.0006 (0.0017) model time 0.5741 (0.5807) loss 6.2803 (7.3702) grad_norm 3.3224 (2.5576) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:08:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][450/625] eta 0:01:42 lr 0.000502 wd 0.0500 time 0.5754 (0.5839) data time 0.0006 (0.0017) model time 0.5748 (0.5805) loss 7.1509 (7.3762) grad_norm 2.3045 (2.5517) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:08:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][460/625] eta 0:01:36 lr 0.000502 wd 0.0500 time 0.5738 (0.5839) data time 0.0007 (0.0017) model time 0.5731 (0.5806) loss 6.6971 (7.3794) grad_norm 2.8353 (2.5480) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:08:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][470/625] eta 0:01:30 lr 0.000502 wd 0.0500 time 0.5718 (0.5838) data time 0.0007 (0.0017) model time 0.5711 (0.5806) loss 7.5441 (7.3957) grad_norm 1.9828 (2.5450) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:08:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][480/625] eta 0:01:24 lr 0.000501 wd 0.0500 time 0.5662 (0.5841) data time 0.0008 (0.0017) model time 0.5654 (0.5809) loss 6.0946 (7.3994) grad_norm 1.9893 (2.5363) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:08:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][490/625] eta 0:01:19 lr 0.000501 wd 0.0500 time 0.7647 (0.5854) data time 0.0006 (0.0017) model time 0.7641 (0.5825) loss 6.3164 (7.4059) grad_norm 2.0520 (2.5254) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:08:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][500/625] eta 0:01:13 lr 0.000501 wd 0.0500 time 0.5750 (0.5861) data time 0.0006 (0.0016) model time 0.5744 (0.5833) loss 7.8649 (7.4024) grad_norm 1.7003 (2.5113) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:09:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][510/625] eta 0:01:07 lr 0.000501 wd 0.0500 time 0.6915 (0.5869) data time 0.0008 (0.0016) model time 0.6907 (0.5842) loss 6.5968 (7.4027) grad_norm 1.6988 (2.5040) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:09:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][520/625] eta 0:01:01 lr 0.000501 wd 0.0500 time 0.5718 (0.5869) data time 0.0007 (0.0016) model time 0.5710 (0.5843) loss 6.1033 (7.3886) grad_norm 2.2774 (2.5051) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:09:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][530/625] eta 0:00:55 lr 0.000501 wd 0.0500 time 0.5757 (0.5868) data time 0.0008 (0.0016) model time 0.5749 (0.5841) loss 7.8029 (7.3893) grad_norm 2.2320 (2.5114) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:09:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][540/625] eta 0:00:49 lr 0.000501 wd 0.0500 time 0.5647 (0.5866) data time 0.0006 (0.0016) model time 0.5641 (0.5840) loss 6.7711 (7.3813) grad_norm 1.8081 (2.5106) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:09:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][550/625] eta 0:00:43 lr 0.000501 wd 0.0500 time 0.5735 (0.5863) data time 0.0006 (0.0016) model time 0.5729 (0.5838) loss 6.9407 (7.3803) grad_norm 2.4532 (2.5186) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:09:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][560/625] eta 0:00:38 lr 0.000501 wd 0.0500 time 0.5750 (0.5861) data time 0.0007 (0.0015) model time 0.5744 (0.5836) loss 7.2588 (7.3832) grad_norm 3.9274 (2.5394) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:09:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][570/625] eta 0:00:32 lr 0.000501 wd 0.0500 time 0.5740 (0.5859) data time 0.0009 (0.0015) model time 0.5731 (0.5834) loss 7.3363 (7.3787) grad_norm 2.3793 (2.5423) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:09:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][580/625] eta 0:00:26 lr 0.000500 wd 0.0500 time 0.5719 (0.5858) data time 0.0008 (0.0015) model time 0.5711 (0.5832) loss 7.6547 (7.3826) grad_norm 1.9100 (2.5393) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:09:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][590/625] eta 0:00:20 lr 0.000500 wd 0.0500 time 0.5731 (0.5856) data time 0.0006 (0.0015) model time 0.5725 (0.5831) loss 6.8588 (7.3776) grad_norm 2.0659 (2.5353) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:09:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][600/625] eta 0:00:14 lr 0.000500 wd 0.0500 time 0.5739 (0.5854) data time 0.0009 (0.0015) model time 0.5729 (0.5829) loss 7.3557 (7.3792) grad_norm 1.7920 (2.5255) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:10:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][610/625] eta 0:00:08 lr 0.000500 wd 0.0500 time 0.5718 (0.5853) data time 0.0005 (0.0015) model time 0.5712 (0.5828) loss 8.4891 (7.3776) grad_norm 1.9626 (2.5174) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:10:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [175/300][620/625] eta 0:00:02 lr 0.000500 wd 0.0500 time 0.5748 (0.5851) data time 0.0004 (0.0015) model time 0.5745 (0.5826) loss 6.1277 (7.3778) grad_norm 2.7850 (2.5176) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:10:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 175 training takes 0:06:05 +[2024-07-25 08:10:09 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 08:10:11 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 08:10:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.481 (0.481) Loss 0.5024 (0.5024) Acc@1 89.990 (89.990) Acc@5 98.779 (98.779) Mem 22339MB +[2024-07-25 08:10:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.7827 (0.6383) Acc@1 81.787 (86.648) Acc@5 96.533 (97.856) Mem 22339MB +[2024-07-25 08:10:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.143) Loss 0.8950 (0.7466) Acc@1 78.564 (83.503) Acc@5 95.947 (96.733) Mem 22339MB +[2024-07-25 08:10:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.215 Acc@5 96.721 +[2024-07-25 08:10:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.2% +[2024-07-25 08:10:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.846 (0.846) Loss 0.4976 (0.4976) Acc@1 89.990 (89.990) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 08:10:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.191) Loss 0.7593 (0.6222) Acc@1 82.422 (86.945) Acc@5 96.484 (97.865) Mem 22339MB +[2024-07-25 08:10:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.160) Loss 0.8823 (0.7198) Acc@1 78.613 (83.938) Acc@5 95.752 (96.912) Mem 22339MB +[2024-07-25 08:10:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.533 Acc@5 96.905 +[2024-07-25 08:10:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.5% +[2024-07-25 08:10:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.53% +[2024-07-25 08:10:18 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 08:10:20 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 08:10:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][0/625] eta 0:09:22 lr 0.000500 wd 0.0500 time 0.9000 (0.9000) data time 0.3816 (0.3816) model time 0.0000 (0.0000) loss 7.5021 (7.5021) grad_norm 2.6965 (2.6965) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:10:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][10/625] eta 0:06:10 lr 0.000500 wd 0.0500 time 0.5707 (0.6019) data time 0.0006 (0.0354) model time 0.0000 (0.0000) loss 6.6713 (7.0858) grad_norm 2.0828 (2.9940) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:10:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][20/625] eta 0:05:56 lr 0.000500 wd 0.0500 time 0.5709 (0.5889) data time 0.0009 (0.0189) model time 0.0000 (0.0000) loss 6.9805 (7.3072) grad_norm 2.2760 (2.8806) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:10:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][30/625] eta 0:05:47 lr 0.000500 wd 0.0500 time 0.5707 (0.5839) data time 0.0008 (0.0131) model time 0.0000 (0.0000) loss 7.7419 (7.2691) grad_norm 2.2619 (2.8855) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:10:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][40/625] eta 0:05:39 lr 0.000500 wd 0.0500 time 0.5698 (0.5812) data time 0.0008 (0.0101) model time 0.0000 (0.0000) loss 6.8455 (7.3107) grad_norm 6.8036 (2.9178) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:10:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][50/625] eta 0:05:33 lr 0.000499 wd 0.0500 time 0.5736 (0.5800) data time 0.0008 (0.0083) model time 0.0000 (0.0000) loss 6.0845 (7.2610) grad_norm 1.9642 (2.8362) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:10:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][60/625] eta 0:05:27 lr 0.000499 wd 0.0500 time 0.5684 (0.5790) data time 0.0006 (0.0071) model time 0.5678 (0.5730) loss 7.2489 (7.3131) grad_norm 2.0099 (2.7354) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:11:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][70/625] eta 0:05:21 lr 0.000499 wd 0.0500 time 0.5738 (0.5789) data time 0.0006 (0.0062) model time 0.5732 (0.5753) loss 7.1948 (7.3833) grad_norm 1.8209 (2.7117) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:11:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][80/625] eta 0:05:16 lr 0.000499 wd 0.0500 time 0.6066 (0.5816) data time 0.0008 (0.0055) model time 0.6058 (0.5835) loss 8.5996 (7.3889) grad_norm 2.0657 (2.6649) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:11:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][90/625] eta 0:05:15 lr 0.000499 wd 0.0500 time 0.7689 (0.5902) data time 0.0009 (0.0050) model time 0.7680 (0.6023) loss 8.6448 (7.4226) grad_norm 1.5568 (2.6026) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:11:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][100/625] eta 0:05:11 lr 0.000499 wd 0.0500 time 0.7165 (0.5925) data time 0.0006 (0.0046) model time 0.7159 (0.6045) loss 7.5717 (7.4132) grad_norm 3.2072 (2.5704) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:11:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][110/625] eta 0:05:07 lr 0.000499 wd 0.0500 time 0.7627 (0.5979) data time 0.0009 (0.0043) model time 0.7618 (0.6122) loss 7.9360 (7.4066) grad_norm 2.8363 (2.5625) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:11:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][120/625] eta 0:05:01 lr 0.000499 wd 0.0500 time 0.6945 (0.5973) data time 0.0006 (0.0040) model time 0.6939 (0.6090) loss 7.0528 (7.3890) grad_norm 1.9563 (2.5468) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:11:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][130/625] eta 0:04:54 lr 0.000499 wd 0.0500 time 0.5720 (0.5955) data time 0.0007 (0.0038) model time 0.5713 (0.6046) loss 7.2588 (7.3957) grad_norm 3.0098 (2.5614) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:11:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][140/625] eta 0:04:48 lr 0.000498 wd 0.0500 time 0.5734 (0.5941) data time 0.0006 (0.0035) model time 0.5727 (0.6013) loss 7.5186 (7.4048) grad_norm 1.9622 (2.5525) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:11:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][150/625] eta 0:04:41 lr 0.000498 wd 0.0500 time 0.5723 (0.5929) data time 0.0008 (0.0034) model time 0.5715 (0.5986) loss 8.1688 (7.4146) grad_norm 2.2124 (2.5432) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:11:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][160/625] eta 0:04:35 lr 0.000498 wd 0.0500 time 0.5746 (0.5917) data time 0.0008 (0.0032) model time 0.5738 (0.5963) loss 5.9870 (7.3859) grad_norm 3.6036 (2.5303) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:12:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][170/625] eta 0:04:28 lr 0.000498 wd 0.0500 time 0.5733 (0.5907) data time 0.0008 (0.0031) model time 0.5724 (0.5944) loss 7.3884 (7.3551) grad_norm 2.4420 (2.5278) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:12:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][180/625] eta 0:04:22 lr 0.000498 wd 0.0500 time 0.5734 (0.5898) data time 0.0006 (0.0029) model time 0.5728 (0.5928) loss 7.2674 (7.3718) grad_norm 2.7721 (2.5373) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:12:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][190/625] eta 0:04:16 lr 0.000498 wd 0.0500 time 0.5725 (0.5890) data time 0.0007 (0.0028) model time 0.5718 (0.5914) loss 9.0690 (7.3792) grad_norm 1.6567 (2.5303) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:12:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][200/625] eta 0:04:10 lr 0.000498 wd 0.0500 time 0.5711 (0.5884) data time 0.0006 (0.0027) model time 0.5705 (0.5904) loss 7.2052 (7.3825) grad_norm 2.3911 (2.5255) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:12:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][210/625] eta 0:04:03 lr 0.000498 wd 0.0500 time 0.5746 (0.5878) data time 0.0006 (0.0026) model time 0.5740 (0.5894) loss 6.5042 (7.3756) grad_norm 3.4460 (2.5321) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:12:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][220/625] eta 0:03:57 lr 0.000498 wd 0.0500 time 0.5725 (0.5874) data time 0.0007 (0.0025) model time 0.5718 (0.5888) loss 7.0536 (7.3721) grad_norm 1.9972 (2.5278) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:12:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][230/625] eta 0:03:52 lr 0.000498 wd 0.0500 time 0.5707 (0.5875) data time 0.0008 (0.0025) model time 0.5699 (0.5888) loss 7.3829 (7.3655) grad_norm 3.2483 (2.5213) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:12:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][240/625] eta 0:03:45 lr 0.000497 wd 0.0500 time 0.5696 (0.5869) data time 0.0006 (0.0024) model time 0.5690 (0.5879) loss 6.0270 (7.3648) grad_norm 1.8037 (2.5244) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:12:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][250/625] eta 0:03:39 lr 0.000497 wd 0.0500 time 0.5703 (0.5863) data time 0.0008 (0.0023) model time 0.5695 (0.5871) loss 5.6302 (7.3683) grad_norm 1.8805 (2.5083) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:12:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][260/625] eta 0:03:33 lr 0.000497 wd 0.0500 time 0.5722 (0.5859) data time 0.0008 (0.0023) model time 0.5714 (0.5865) loss 6.4598 (7.3629) grad_norm 2.3876 (2.5075) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:12:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][270/625] eta 0:03:27 lr 0.000497 wd 0.0500 time 0.5751 (0.5854) data time 0.0007 (0.0022) model time 0.5745 (0.5858) loss 7.6767 (7.3562) grad_norm 3.1963 (2.5275) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:13:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][280/625] eta 0:03:21 lr 0.000497 wd 0.0500 time 0.5733 (0.5851) data time 0.0009 (0.0022) model time 0.5724 (0.5854) loss 7.8307 (7.3825) grad_norm 1.7420 (2.5234) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:13:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][290/625] eta 0:03:16 lr 0.000497 wd 0.0500 time 0.7098 (0.5854) data time 0.0006 (0.0021) model time 0.7092 (0.5857) loss 7.9338 (7.3868) grad_norm 1.6408 (2.5066) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:13:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][300/625] eta 0:03:10 lr 0.000497 wd 0.0500 time 0.7678 (0.5859) data time 0.0006 (0.0021) model time 0.7672 (0.5862) loss 8.2232 (7.3961) grad_norm 2.0900 (2.4984) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:13:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][310/625] eta 0:03:05 lr 0.000497 wd 0.0500 time 0.6933 (0.5874) data time 0.0008 (0.0020) model time 0.6926 (0.5881) loss 8.6310 (7.4095) grad_norm 2.8025 (2.4872) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:13:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][320/625] eta 0:02:59 lr 0.000497 wd 0.0500 time 0.5628 (0.5878) data time 0.0008 (0.0020) model time 0.5620 (0.5885) loss 6.3210 (7.4029) grad_norm 1.5665 (2.4841) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:13:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][330/625] eta 0:02:53 lr 0.000496 wd 0.0500 time 0.7179 (0.5895) data time 0.0008 (0.0020) model time 0.7171 (0.5904) loss 8.4484 (7.4181) grad_norm 2.1863 (2.4806) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:13:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][340/625] eta 0:02:48 lr 0.000496 wd 0.0500 time 0.6805 (0.5895) data time 0.0008 (0.0019) model time 0.6797 (0.5903) loss 9.0214 (7.4393) grad_norm 1.9686 (2.4750) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:13:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][350/625] eta 0:02:42 lr 0.000496 wd 0.0500 time 0.5770 (0.5891) data time 0.0006 (0.0019) model time 0.5764 (0.5898) loss 5.9819 (7.4281) grad_norm 1.8177 (2.4608) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:13:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][360/625] eta 0:02:35 lr 0.000496 wd 0.0500 time 0.5725 (0.5887) data time 0.0008 (0.0019) model time 0.5717 (0.5893) loss 8.7911 (7.4239) grad_norm 2.5857 (2.4507) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:13:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][370/625] eta 0:02:30 lr 0.000496 wd 0.0500 time 0.5729 (0.5883) data time 0.0006 (0.0018) model time 0.5723 (0.5888) loss 5.9333 (7.4160) grad_norm 1.4717 (2.4359) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:14:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][380/625] eta 0:02:24 lr 0.000496 wd 0.0500 time 0.5614 (0.5879) data time 0.0008 (0.0018) model time 0.5606 (0.5883) loss 7.7517 (7.4060) grad_norm 2.3395 (2.4303) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:14:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][390/625] eta 0:02:18 lr 0.000496 wd 0.0500 time 0.5699 (0.5876) data time 0.0008 (0.0018) model time 0.5691 (0.5879) loss 8.1088 (7.4134) grad_norm 2.9711 (2.4309) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:14:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][400/625] eta 0:02:12 lr 0.000496 wd 0.0500 time 0.5792 (0.5873) data time 0.0008 (0.0018) model time 0.5784 (0.5875) loss 8.0160 (7.4150) grad_norm 2.2186 (2.4384) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:14:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][410/625] eta 0:02:06 lr 0.000496 wd 0.0500 time 0.5754 (0.5870) data time 0.0009 (0.0017) model time 0.5745 (0.5872) loss 6.5138 (7.4093) grad_norm 2.7835 (2.4350) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:14:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][420/625] eta 0:02:00 lr 0.000496 wd 0.0500 time 0.5811 (0.5867) data time 0.0008 (0.0017) model time 0.5803 (0.5868) loss 8.4468 (7.4073) grad_norm 1.5138 (2.4223) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:14:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][430/625] eta 0:01:54 lr 0.000495 wd 0.0500 time 0.5724 (0.5864) data time 0.0008 (0.0017) model time 0.5716 (0.5865) loss 5.9978 (7.4050) grad_norm 2.1734 (2.4127) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:14:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][440/625] eta 0:01:48 lr 0.000495 wd 0.0500 time 0.5731 (0.5861) data time 0.0007 (0.0017) model time 0.5725 (0.5861) loss 7.1597 (7.4164) grad_norm 1.8464 (2.4040) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:14:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][450/625] eta 0:01:42 lr 0.000495 wd 0.0500 time 0.5665 (0.5860) data time 0.0009 (0.0017) model time 0.5656 (0.5860) loss 8.5544 (7.4236) grad_norm 2.7422 (2.4281) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:14:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][460/625] eta 0:01:36 lr 0.000495 wd 0.0500 time 0.5691 (0.5858) data time 0.0009 (0.0016) model time 0.5682 (0.5857) loss 5.9951 (7.4219) grad_norm 3.1953 (2.4469) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:14:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][470/625] eta 0:01:30 lr 0.000495 wd 0.0500 time 0.5736 (0.5855) data time 0.0006 (0.0016) model time 0.5730 (0.5854) loss 7.0576 (7.4227) grad_norm 2.5292 (2.4524) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:15:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][480/625] eta 0:01:24 lr 0.000495 wd 0.0500 time 0.5727 (0.5853) data time 0.0009 (0.0016) model time 0.5718 (0.5851) loss 7.7638 (7.4205) grad_norm 2.7551 (2.4652) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:15:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][490/625] eta 0:01:18 lr 0.000495 wd 0.0500 time 0.5742 (0.5851) data time 0.0006 (0.0016) model time 0.5736 (0.5849) loss 5.7099 (7.4222) grad_norm 5.2513 (2.4691) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:15:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][500/625] eta 0:01:13 lr 0.000495 wd 0.0500 time 0.5728 (0.5850) data time 0.0009 (0.0016) model time 0.5719 (0.5847) loss 9.2831 (7.4312) grad_norm 2.0641 (2.4687) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:15:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][510/625] eta 0:01:07 lr 0.000495 wd 0.0500 time 0.5748 (0.5850) data time 0.0006 (0.0016) model time 0.5741 (0.5848) loss 8.3968 (7.4363) grad_norm 2.2055 (2.4707) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:15:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][520/625] eta 0:01:01 lr 0.000494 wd 0.0500 time 0.5715 (0.5854) data time 0.0008 (0.0015) model time 0.5707 (0.5851) loss 6.7518 (7.4408) grad_norm 2.1606 (2.4720) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:15:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][530/625] eta 0:00:55 lr 0.000494 wd 0.0500 time 0.5619 (0.5865) data time 0.0009 (0.0015) model time 0.5610 (0.5863) loss 8.0742 (7.4451) grad_norm 3.7776 (2.4784) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:15:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][540/625] eta 0:00:49 lr 0.000494 wd 0.0500 time 0.7276 (0.5871) data time 0.0007 (0.0015) model time 0.7269 (0.5870) loss 6.8297 (7.4409) grad_norm 4.2208 (2.4870) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:15:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][550/625] eta 0:00:44 lr 0.000494 wd 0.0500 time 0.5725 (0.5882) data time 0.0008 (0.0015) model time 0.5717 (0.5882) loss 8.8251 (7.4467) grad_norm 2.9332 (2.4889) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:15:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][560/625] eta 0:00:38 lr 0.000494 wd 0.0500 time 0.5617 (0.5882) data time 0.0008 (0.0015) model time 0.5609 (0.5882) loss 7.6574 (7.4498) grad_norm 1.8089 (2.4826) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:15:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][570/625] eta 0:00:32 lr 0.000494 wd 0.0500 time 0.5778 (0.5880) data time 0.0008 (0.0015) model time 0.5770 (0.5880) loss 7.7107 (7.4475) grad_norm 2.4938 (2.4836) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:16:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][580/625] eta 0:00:26 lr 0.000494 wd 0.0500 time 0.5726 (0.5878) data time 0.0006 (0.0015) model time 0.5720 (0.5877) loss 6.1760 (7.4440) grad_norm 1.9556 (2.4822) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:16:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][590/625] eta 0:00:20 lr 0.000494 wd 0.0500 time 0.5717 (0.5875) data time 0.0006 (0.0015) model time 0.5711 (0.5874) loss 8.3569 (7.4404) grad_norm 2.9118 (2.4804) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:16:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][600/625] eta 0:00:14 lr 0.000494 wd 0.0500 time 0.5722 (0.5873) data time 0.0006 (0.0015) model time 0.5716 (0.5871) loss 6.6599 (7.4299) grad_norm 5.4780 (2.4888) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:16:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][610/625] eta 0:00:08 lr 0.000494 wd 0.0500 time 0.5666 (0.5871) data time 0.0006 (0.0014) model time 0.5660 (0.5869) loss 7.4018 (7.4277) grad_norm 1.8138 (2.4914) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:16:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [176/300][620/625] eta 0:00:02 lr 0.000493 wd 0.0500 time 0.5718 (0.5868) data time 0.0004 (0.0014) model time 0.5714 (0.5866) loss 8.5214 (7.4225) grad_norm 1.9418 (2.4892) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:16:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 176 training takes 0:06:06 +[2024-07-25 08:16:26 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 08:16:28 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 08:16:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.477 (0.477) Loss 0.5063 (0.5063) Acc@1 89.551 (89.551) Acc@5 98.779 (98.779) Mem 22339MB +[2024-07-25 08:16:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7812 (0.6338) Acc@1 82.324 (86.692) Acc@5 96.094 (97.763) Mem 22339MB +[2024-07-25 08:16:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8989 (0.7414) Acc@1 78.271 (83.498) Acc@5 95.557 (96.738) Mem 22339MB +[2024-07-25 08:16:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.187 Acc@5 96.713 +[2024-07-25 08:16:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.2% +[2024-07-25 08:16:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.788 (0.788) Loss 0.4985 (0.4985) Acc@1 89.990 (89.990) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-25 08:16:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.186) Loss 0.7593 (0.6226) Acc@1 82.422 (86.958) Acc@5 96.533 (97.874) Mem 22339MB +[2024-07-25 08:16:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.157) Loss 0.8828 (0.7198) Acc@1 78.613 (83.968) Acc@5 95.801 (96.928) Mem 22339MB +[2024-07-25 08:16:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.555 Acc@5 96.919 +[2024-07-25 08:16:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.6% +[2024-07-25 08:16:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.56% +[2024-07-25 08:16:35 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 08:16:37 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 08:16:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][0/625] eta 0:09:32 lr 0.000493 wd 0.0500 time 0.9156 (0.9156) data time 0.3983 (0.3983) model time 0.0000 (0.0000) loss 7.7795 (7.7795) grad_norm 4.4991 (4.4991) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:16:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][10/625] eta 0:06:11 lr 0.000493 wd 0.0500 time 0.5715 (0.6043) data time 0.0006 (0.0369) model time 0.0000 (0.0000) loss 7.8172 (7.5809) grad_norm 2.7287 (2.8979) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:16:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][20/625] eta 0:05:57 lr 0.000493 wd 0.0500 time 0.5648 (0.5901) data time 0.0008 (0.0198) model time 0.0000 (0.0000) loss 7.5477 (7.4973) grad_norm 2.5945 (3.0241) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:16:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][30/625] eta 0:05:47 lr 0.000493 wd 0.0500 time 0.5729 (0.5848) data time 0.0008 (0.0137) model time 0.0000 (0.0000) loss 6.2841 (7.2257) grad_norm 2.5722 (2.8900) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:17:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][40/625] eta 0:05:40 lr 0.000493 wd 0.0500 time 0.5750 (0.5820) data time 0.0008 (0.0106) model time 0.0000 (0.0000) loss 8.9317 (7.1818) grad_norm 2.2422 (2.7249) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:17:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][50/625] eta 0:05:33 lr 0.000493 wd 0.0500 time 0.5713 (0.5802) data time 0.0006 (0.0087) model time 0.0000 (0.0000) loss 6.2724 (7.2359) grad_norm 2.7243 (2.6201) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:17:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][60/625] eta 0:05:27 lr 0.000493 wd 0.0500 time 0.5704 (0.5792) data time 0.0007 (0.0074) model time 0.5698 (0.5731) loss 7.4928 (7.3114) grad_norm 2.8416 (2.5362) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:17:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][70/625] eta 0:05:21 lr 0.000493 wd 0.0500 time 0.5746 (0.5785) data time 0.0006 (0.0065) model time 0.5740 (0.5733) loss 6.4875 (7.2868) grad_norm 1.7294 (2.4858) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:17:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][80/625] eta 0:05:15 lr 0.000493 wd 0.0500 time 0.5728 (0.5780) data time 0.0006 (0.0058) model time 0.5722 (0.5734) loss 7.8016 (7.3072) grad_norm 1.6646 (2.4438) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:17:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][90/625] eta 0:05:09 lr 0.000492 wd 0.0500 time 0.5736 (0.5777) data time 0.0006 (0.0053) model time 0.5730 (0.5735) loss 7.4388 (7.3281) grad_norm 1.9628 (2.4325) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:17:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][100/625] eta 0:05:03 lr 0.000492 wd 0.0500 time 0.5754 (0.5788) data time 0.0006 (0.0048) model time 0.5748 (0.5764) loss 6.6401 (7.3356) grad_norm 2.4356 (2.4137) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:17:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][110/625] eta 0:04:58 lr 0.000492 wd 0.0500 time 0.5734 (0.5798) data time 0.0006 (0.0045) model time 0.5728 (0.5785) loss 6.0113 (7.3427) grad_norm 2.3225 (2.4060) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:17:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][120/625] eta 0:04:54 lr 0.000492 wd 0.0500 time 0.7157 (0.5830) data time 0.0006 (0.0042) model time 0.7151 (0.5841) loss 6.5857 (7.3376) grad_norm 1.8763 (2.4651) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:17:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][130/625] eta 0:04:51 lr 0.000492 wd 0.0500 time 0.7092 (0.5882) data time 0.0008 (0.0039) model time 0.7084 (0.5924) loss 8.4910 (7.3494) grad_norm 4.7477 (2.4722) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:18:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][140/625] eta 0:04:46 lr 0.000492 wd 0.0500 time 0.5720 (0.5911) data time 0.0007 (0.0037) model time 0.5713 (0.5963) loss 8.5916 (7.3577) grad_norm 2.1238 (2.4665) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:18:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][150/625] eta 0:04:41 lr 0.000492 wd 0.0500 time 0.7272 (0.5926) data time 0.0008 (0.0035) model time 0.7265 (0.5980) loss 7.6837 (7.3409) grad_norm 1.8503 (2.4568) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:18:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][160/625] eta 0:04:35 lr 0.000492 wd 0.0500 time 0.5738 (0.5918) data time 0.0007 (0.0033) model time 0.5730 (0.5963) loss 7.8596 (7.3255) grad_norm 2.5666 (2.4692) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:18:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][170/625] eta 0:04:28 lr 0.000492 wd 0.0500 time 0.5714 (0.5908) data time 0.0006 (0.0032) model time 0.5708 (0.5944) loss 8.3729 (7.3351) grad_norm 1.8084 (2.4729) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:18:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][180/625] eta 0:04:22 lr 0.000492 wd 0.0500 time 0.5700 (0.5899) data time 0.0006 (0.0031) model time 0.5694 (0.5928) loss 6.5366 (7.3418) grad_norm 2.2185 (2.5201) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:18:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][190/625] eta 0:04:16 lr 0.000491 wd 0.0500 time 0.5725 (0.5891) data time 0.0006 (0.0029) model time 0.5719 (0.5914) loss 8.5891 (7.3616) grad_norm 3.1055 (2.5363) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:18:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][200/625] eta 0:04:10 lr 0.000491 wd 0.0500 time 0.5725 (0.5885) data time 0.0006 (0.0028) model time 0.5719 (0.5904) loss 8.0694 (7.3565) grad_norm 1.7764 (2.5602) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:18:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][210/625] eta 0:04:04 lr 0.000491 wd 0.0500 time 0.5744 (0.5880) data time 0.0008 (0.0027) model time 0.5736 (0.5897) loss 8.5238 (7.3919) grad_norm 1.8739 (2.5393) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:18:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][220/625] eta 0:03:57 lr 0.000491 wd 0.0500 time 0.5668 (0.5875) data time 0.0006 (0.0027) model time 0.5662 (0.5888) loss 8.6047 (7.3976) grad_norm 2.0969 (2.5172) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:18:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][230/625] eta 0:03:51 lr 0.000491 wd 0.0500 time 0.5731 (0.5869) data time 0.0008 (0.0026) model time 0.5723 (0.5879) loss 7.3076 (7.4064) grad_norm 1.9397 (2.5103) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:18:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][240/625] eta 0:03:45 lr 0.000491 wd 0.0500 time 0.5704 (0.5864) data time 0.0008 (0.0025) model time 0.5696 (0.5872) loss 7.7126 (7.4018) grad_norm 2.2061 (2.4964) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:19:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][250/625] eta 0:03:39 lr 0.000491 wd 0.0500 time 0.5740 (0.5859) data time 0.0006 (0.0024) model time 0.5734 (0.5865) loss 6.1527 (7.3983) grad_norm 2.1334 (2.4925) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:19:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][260/625] eta 0:03:33 lr 0.000491 wd 0.0500 time 0.5698 (0.5855) data time 0.0008 (0.0024) model time 0.5689 (0.5859) loss 6.9692 (7.4099) grad_norm 2.2469 (2.4875) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:19:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][270/625] eta 0:03:27 lr 0.000491 wd 0.0500 time 0.5739 (0.5851) data time 0.0006 (0.0023) model time 0.5733 (0.5854) loss 8.3285 (7.4194) grad_norm 2.0930 (2.4914) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:19:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][280/625] eta 0:03:21 lr 0.000490 wd 0.0500 time 0.5711 (0.5847) data time 0.0006 (0.0023) model time 0.5705 (0.5849) loss 6.2777 (7.4107) grad_norm 3.4498 (2.4827) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:19:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][290/625] eta 0:03:15 lr 0.000490 wd 0.0500 time 0.5739 (0.5844) data time 0.0008 (0.0022) model time 0.5731 (0.5844) loss 7.7937 (7.3998) grad_norm 1.8393 (2.4691) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:19:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][300/625] eta 0:03:09 lr 0.000490 wd 0.0500 time 0.5747 (0.5841) data time 0.0006 (0.0022) model time 0.5741 (0.5840) loss 7.8968 (7.4014) grad_norm 2.1777 (2.4578) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:19:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][310/625] eta 0:03:03 lr 0.000490 wd 0.0500 time 0.5717 (0.5838) data time 0.0006 (0.0021) model time 0.5710 (0.5837) loss 9.2676 (7.4116) grad_norm 1.8617 (2.4708) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:19:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][320/625] eta 0:02:58 lr 0.000490 wd 0.0500 time 0.5613 (0.5837) data time 0.0008 (0.0021) model time 0.5605 (0.5835) loss 7.0144 (7.3999) grad_norm 1.7178 (2.4856) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:19:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][330/625] eta 0:02:52 lr 0.000490 wd 0.0500 time 0.5686 (0.5839) data time 0.0008 (0.0021) model time 0.5678 (0.5837) loss 7.4360 (7.3953) grad_norm 3.2450 (2.4813) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:19:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][340/625] eta 0:02:46 lr 0.000490 wd 0.0500 time 0.7653 (0.5847) data time 0.0008 (0.0020) model time 0.7645 (0.5847) loss 8.3526 (7.3918) grad_norm 2.6209 (2.4768) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:20:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][350/625] eta 0:02:41 lr 0.000490 wd 0.0500 time 0.7333 (0.5861) data time 0.0008 (0.0020) model time 0.7325 (0.5863) loss 9.1667 (7.4001) grad_norm 2.0345 (2.4819) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:20:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][360/625] eta 0:02:35 lr 0.000490 wd 0.0500 time 0.7082 (0.5867) data time 0.0007 (0.0020) model time 0.7075 (0.5869) loss 8.4228 (7.3941) grad_norm 3.1174 (2.4779) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:20:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][370/625] eta 0:02:29 lr 0.000490 wd 0.0500 time 0.5701 (0.5872) data time 0.0008 (0.0019) model time 0.5693 (0.5875) loss 6.5392 (7.3881) grad_norm 2.0190 (2.4859) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:20:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][380/625] eta 0:02:23 lr 0.000489 wd 0.0500 time 0.5684 (0.5869) data time 0.0008 (0.0019) model time 0.5676 (0.5871) loss 7.5228 (7.3838) grad_norm 2.3087 (2.4822) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:20:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][390/625] eta 0:02:17 lr 0.000489 wd 0.0500 time 0.5704 (0.5866) data time 0.0006 (0.0019) model time 0.5699 (0.5867) loss 6.7075 (7.3780) grad_norm 2.1981 (2.4729) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:20:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][400/625] eta 0:02:11 lr 0.000489 wd 0.0500 time 0.5687 (0.5863) data time 0.0006 (0.0018) model time 0.5681 (0.5863) loss 7.0336 (7.3731) grad_norm 1.8019 (2.4669) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:20:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][410/625] eta 0:02:05 lr 0.000489 wd 0.0500 time 0.5640 (0.5860) data time 0.0009 (0.0018) model time 0.5631 (0.5859) loss 7.7757 (7.3719) grad_norm 2.4740 (2.4667) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:20:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][420/625] eta 0:02:00 lr 0.000489 wd 0.0500 time 0.5747 (0.5857) data time 0.0008 (0.0018) model time 0.5739 (0.5856) loss 6.2719 (7.3696) grad_norm 1.9078 (2.4647) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:20:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][430/625] eta 0:01:54 lr 0.000489 wd 0.0500 time 0.5714 (0.5856) data time 0.0006 (0.0018) model time 0.5708 (0.5854) loss 6.5235 (7.3698) grad_norm 2.9805 (2.4717) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:20:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][440/625] eta 0:01:48 lr 0.000489 wd 0.0500 time 0.5722 (0.5853) data time 0.0006 (0.0018) model time 0.5716 (0.5851) loss 5.8496 (7.3691) grad_norm 1.8944 (2.4820) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:21:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][450/625] eta 0:01:42 lr 0.000489 wd 0.0500 time 0.5758 (0.5851) data time 0.0008 (0.0017) model time 0.5750 (0.5848) loss 7.4037 (7.3646) grad_norm 2.6517 (2.4775) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:21:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][460/625] eta 0:01:36 lr 0.000489 wd 0.0500 time 0.5795 (0.5848) data time 0.0008 (0.0017) model time 0.5787 (0.5845) loss 6.6858 (7.3727) grad_norm 2.8408 (2.4797) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:21:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][470/625] eta 0:01:30 lr 0.000488 wd 0.0500 time 0.5723 (0.5846) data time 0.0006 (0.0017) model time 0.5717 (0.5843) loss 6.7551 (7.3722) grad_norm 2.1624 (2.4799) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:21:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][480/625] eta 0:01:24 lr 0.000488 wd 0.0500 time 0.5719 (0.5844) data time 0.0006 (0.0017) model time 0.5713 (0.5840) loss 8.6240 (7.3742) grad_norm 2.7425 (2.4857) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:21:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][490/625] eta 0:01:18 lr 0.000488 wd 0.0500 time 0.5700 (0.5842) data time 0.0006 (0.0017) model time 0.5693 (0.5838) loss 6.3833 (7.3729) grad_norm 2.6851 (2.4827) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:21:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][500/625] eta 0:01:13 lr 0.000488 wd 0.0500 time 0.5736 (0.5840) data time 0.0006 (0.0017) model time 0.5730 (0.5836) loss 6.4098 (7.3751) grad_norm 3.8922 (2.4842) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:21:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][510/625] eta 0:01:07 lr 0.000488 wd 0.0500 time 0.5703 (0.5838) data time 0.0008 (0.0016) model time 0.5694 (0.5834) loss 6.1882 (7.3761) grad_norm 2.7204 (2.4872) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:21:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][520/625] eta 0:01:01 lr 0.000488 wd 0.0500 time 0.5742 (0.5837) data time 0.0008 (0.0016) model time 0.5735 (0.5832) loss 8.5154 (7.3804) grad_norm 1.8626 (2.4849) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:21:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][530/625] eta 0:00:55 lr 0.000488 wd 0.0500 time 0.5760 (0.5836) data time 0.0008 (0.0016) model time 0.5752 (0.5831) loss 8.0430 (7.3897) grad_norm 2.4617 (2.4800) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:21:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][540/625] eta 0:00:49 lr 0.000488 wd 0.0500 time 0.5698 (0.5835) data time 0.0008 (0.0016) model time 0.5689 (0.5830) loss 7.7100 (7.3868) grad_norm 7.8887 (2.4935) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:21:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][550/625] eta 0:00:43 lr 0.000488 wd 0.0500 time 0.5722 (0.5834) data time 0.0008 (0.0016) model time 0.5714 (0.5829) loss 7.1384 (7.3811) grad_norm 3.8520 (2.5009) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:22:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][560/625] eta 0:00:37 lr 0.000488 wd 0.0500 time 0.7620 (0.5845) data time 0.0007 (0.0016) model time 0.7613 (0.5841) loss 7.7049 (7.3939) grad_norm 2.4671 (2.5046) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:22:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][570/625] eta 0:00:32 lr 0.000487 wd 0.0500 time 0.5731 (0.5852) data time 0.0008 (0.0016) model time 0.5723 (0.5848) loss 8.1791 (7.3941) grad_norm 1.8287 (2.5378) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:22:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][580/625] eta 0:00:26 lr 0.000487 wd 0.0500 time 0.7256 (0.5862) data time 0.0008 (0.0016) model time 0.7248 (0.5860) loss 8.6965 (7.3944) grad_norm 2.2073 (2.5434) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:22:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][590/625] eta 0:00:20 lr 0.000487 wd 0.0500 time 0.5670 (0.5870) data time 0.0006 (0.0015) model time 0.5664 (0.5868) loss 6.5282 (7.3882) grad_norm 2.2318 (2.5554) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:22:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][600/625] eta 0:00:14 lr 0.000487 wd 0.0500 time 0.5723 (0.5870) data time 0.0008 (0.0015) model time 0.5715 (0.5868) loss 6.2407 (7.3831) grad_norm 2.8162 (2.5561) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:22:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][610/625] eta 0:00:08 lr 0.000487 wd 0.0500 time 0.5687 (0.5869) data time 0.0006 (0.0015) model time 0.5681 (0.5866) loss 6.4187 (7.3949) grad_norm 2.0495 (2.5494) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:22:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [177/300][620/625] eta 0:00:02 lr 0.000487 wd 0.0500 time 0.5620 (0.5868) data time 0.0004 (0.0015) model time 0.5616 (0.5865) loss 6.2182 (7.3909) grad_norm 2.0667 (2.5454) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:22:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 177 training takes 0:06:06 +[2024-07-25 08:22:44 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 08:22:45 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 08:22:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.475 (0.475) Loss 0.5161 (0.5161) Acc@1 89.697 (89.697) Acc@5 98.828 (98.828) Mem 22339MB +[2024-07-25 08:22:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7900 (0.6400) Acc@1 82.324 (86.692) Acc@5 96.533 (97.825) Mem 22339MB +[2024-07-25 08:22:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9238 (0.7466) Acc@1 77.588 (83.515) Acc@5 95.312 (96.768) Mem 22339MB +[2024-07-25 08:22:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.213 Acc@5 96.755 +[2024-07-25 08:22:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.2% +[2024-07-25 08:22:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.800 (0.800) Loss 0.4990 (0.4990) Acc@1 89.990 (89.990) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-25 08:22:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.187) Loss 0.7568 (0.6223) Acc@1 82.568 (86.981) Acc@5 96.582 (97.883) Mem 22339MB +[2024-07-25 08:22:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.157) Loss 0.8818 (0.7193) Acc@1 78.613 (83.994) Acc@5 95.801 (96.931) Mem 22339MB +[2024-07-25 08:22:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.587 Acc@5 96.925 +[2024-07-25 08:22:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.6% +[2024-07-25 08:22:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.59% +[2024-07-25 08:22:53 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 08:22:54 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 08:22:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][0/625] eta 0:08:54 lr 0.000487 wd 0.0500 time 0.8545 (0.8545) data time 0.3376 (0.3376) model time 0.0000 (0.0000) loss 7.3112 (7.3112) grad_norm 1.8969 (1.8969) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:23:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][10/625] eta 0:06:07 lr 0.000487 wd 0.0500 time 0.5703 (0.5983) data time 0.0008 (0.0319) model time 0.0000 (0.0000) loss 6.9785 (7.3931) grad_norm 1.8326 (1.8407) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:23:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][20/625] eta 0:05:55 lr 0.000487 wd 0.0500 time 0.5719 (0.5868) data time 0.0006 (0.0171) model time 0.0000 (0.0000) loss 7.8247 (7.3246) grad_norm 1.8319 (1.8861) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:23:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][30/625] eta 0:05:46 lr 0.000487 wd 0.0500 time 0.5720 (0.5828) data time 0.0008 (0.0119) model time 0.0000 (0.0000) loss 8.1642 (7.3934) grad_norm 1.8557 (1.9986) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:23:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][40/625] eta 0:05:39 lr 0.000486 wd 0.0500 time 0.5685 (0.5807) data time 0.0006 (0.0092) model time 0.0000 (0.0000) loss 5.7576 (7.3038) grad_norm 3.0772 (1.9930) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:23:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][50/625] eta 0:05:33 lr 0.000486 wd 0.0500 time 0.5734 (0.5797) data time 0.0009 (0.0076) model time 0.0000 (0.0000) loss 8.3701 (7.3852) grad_norm 1.6032 (2.0505) loss_scale 2048.0000 (1184.6275) mem 22339MB +[2024-07-25 08:23:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][60/625] eta 0:05:27 lr 0.000486 wd 0.0500 time 0.5718 (0.5788) data time 0.0006 (0.0065) model time 0.5712 (0.5734) loss 8.1608 (7.4435) grad_norm 2.1957 (2.1097) loss_scale 2048.0000 (1326.1639) mem 22339MB +[2024-07-25 08:23:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][70/625] eta 0:05:21 lr 0.000486 wd 0.0500 time 0.5780 (0.5786) data time 0.0006 (0.0057) model time 0.5774 (0.5749) loss 6.9795 (7.4325) grad_norm 2.4882 (2.1305) loss_scale 2048.0000 (1427.8310) mem 22339MB +[2024-07-25 08:23:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][80/625] eta 0:05:15 lr 0.000486 wd 0.0500 time 0.5728 (0.5781) data time 0.0006 (0.0051) model time 0.5723 (0.5745) loss 7.4672 (7.4324) grad_norm 1.7769 (2.1399) loss_scale 2048.0000 (1504.3951) mem 22339MB +[2024-07-25 08:23:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][90/625] eta 0:05:09 lr 0.000486 wd 0.0500 time 0.5737 (0.5777) data time 0.0009 (0.0046) model time 0.5728 (0.5744) loss 9.1197 (7.4109) grad_norm 2.9441 (2.2263) loss_scale 2048.0000 (1564.1319) mem 22339MB +[2024-07-25 08:23:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][100/625] eta 0:05:03 lr 0.000486 wd 0.0500 time 0.5750 (0.5775) data time 0.0008 (0.0042) model time 0.5742 (0.5744) loss 7.2433 (7.3949) grad_norm 2.5492 (2.2751) loss_scale 2048.0000 (1612.0396) mem 22339MB +[2024-07-25 08:23:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][110/625] eta 0:04:57 lr 0.000486 wd 0.0500 time 0.5756 (0.5773) data time 0.0008 (0.0039) model time 0.5749 (0.5745) loss 5.9576 (7.3877) grad_norm 1.6360 (2.2544) loss_scale 2048.0000 (1651.3153) mem 22339MB +[2024-07-25 08:24:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][120/625] eta 0:04:51 lr 0.000486 wd 0.0500 time 0.5633 (0.5771) data time 0.0006 (0.0037) model time 0.5627 (0.5743) loss 7.3612 (7.3981) grad_norm 2.3062 (2.2456) loss_scale 2048.0000 (1684.0992) mem 22339MB +[2024-07-25 08:24:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][130/625] eta 0:04:45 lr 0.000485 wd 0.0500 time 0.5751 (0.5770) data time 0.0006 (0.0035) model time 0.5746 (0.5744) loss 6.9457 (7.3964) grad_norm 2.1023 (2.2941) loss_scale 2048.0000 (1711.8779) mem 22339MB +[2024-07-25 08:24:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][140/625] eta 0:04:39 lr 0.000485 wd 0.0500 time 0.5727 (0.5771) data time 0.0008 (0.0033) model time 0.5719 (0.5749) loss 8.1757 (7.3838) grad_norm 1.9211 (2.2860) loss_scale 2048.0000 (1735.7163) mem 22339MB +[2024-07-25 08:24:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][150/625] eta 0:04:34 lr 0.000485 wd 0.0500 time 0.5685 (0.5781) data time 0.0008 (0.0031) model time 0.5677 (0.5764) loss 8.7325 (7.4024) grad_norm 3.2930 (2.3052) loss_scale 2048.0000 (1756.3974) mem 22339MB +[2024-07-25 08:24:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][160/625] eta 0:04:31 lr 0.000485 wd 0.0500 time 0.5735 (0.5830) data time 0.0008 (0.0030) model time 0.5727 (0.5836) loss 8.1502 (7.3941) grad_norm 2.3888 (2.3142) loss_scale 2048.0000 (1774.5093) mem 22339MB +[2024-07-25 08:24:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][170/625] eta 0:04:27 lr 0.000485 wd 0.0500 time 0.5635 (0.5871) data time 0.0006 (0.0029) model time 0.5629 (0.5895) loss 6.1241 (7.3995) grad_norm 1.8114 (2.3127) loss_scale 2048.0000 (1790.5029) mem 22339MB +[2024-07-25 08:24:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][180/625] eta 0:04:22 lr 0.000485 wd 0.0500 time 0.6402 (0.5900) data time 0.0006 (0.0027) model time 0.6395 (0.5932) loss 6.4148 (7.4032) grad_norm 1.9711 (2.3230) loss_scale 2048.0000 (1804.7293) mem 22339MB +[2024-07-25 08:24:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][190/625] eta 0:04:16 lr 0.000485 wd 0.0500 time 0.5691 (0.5902) data time 0.0008 (0.0026) model time 0.5684 (0.5932) loss 7.7812 (7.3963) grad_norm 2.1469 (2.3494) loss_scale 2048.0000 (1817.4660) mem 22339MB +[2024-07-25 08:24:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][200/625] eta 0:04:10 lr 0.000485 wd 0.0500 time 0.5724 (0.5899) data time 0.0007 (0.0026) model time 0.5717 (0.5926) loss 8.4161 (7.4205) grad_norm 1.8286 (2.3372) loss_scale 2048.0000 (1828.9353) mem 22339MB +[2024-07-25 08:24:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][210/625] eta 0:04:04 lr 0.000485 wd 0.0500 time 0.5910 (0.5892) data time 0.0009 (0.0025) model time 0.5901 (0.5914) loss 7.8734 (7.4251) grad_norm 1.4489 (2.3264) loss_scale 2048.0000 (1839.3175) mem 22339MB +[2024-07-25 08:25:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][220/625] eta 0:03:58 lr 0.000485 wd 0.0500 time 0.5726 (0.5884) data time 0.0006 (0.0024) model time 0.5719 (0.5902) loss 7.1897 (7.4336) grad_norm 1.9303 (2.3246) loss_scale 2048.0000 (1848.7602) mem 22339MB +[2024-07-25 08:25:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][230/625] eta 0:03:52 lr 0.000484 wd 0.0500 time 0.5723 (0.5878) data time 0.0008 (0.0023) model time 0.5715 (0.5892) loss 8.2335 (7.4438) grad_norm 2.4650 (2.3523) loss_scale 2048.0000 (1857.3853) mem 22339MB +[2024-07-25 08:25:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][240/625] eta 0:03:46 lr 0.000484 wd 0.0500 time 0.5729 (0.5873) data time 0.0006 (0.0023) model time 0.5723 (0.5885) loss 6.2940 (7.4252) grad_norm 2.9657 (2.3725) loss_scale 2048.0000 (1865.2946) mem 22339MB +[2024-07-25 08:25:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][250/625] eta 0:03:40 lr 0.000484 wd 0.0500 time 0.5716 (0.5868) data time 0.0006 (0.0022) model time 0.5710 (0.5878) loss 7.3246 (7.4095) grad_norm 1.7731 (2.3689) loss_scale 2048.0000 (1872.5737) mem 22339MB +[2024-07-25 08:25:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][260/625] eta 0:03:34 lr 0.000484 wd 0.0500 time 0.5750 (0.5864) data time 0.0006 (0.0022) model time 0.5743 (0.5871) loss 5.9226 (7.3947) grad_norm 2.2672 (2.3902) loss_scale 2048.0000 (1879.2950) mem 22339MB +[2024-07-25 08:25:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][270/625] eta 0:03:28 lr 0.000484 wd 0.0500 time 0.5755 (0.5859) data time 0.0008 (0.0021) model time 0.5748 (0.5865) loss 7.6840 (7.3983) grad_norm 2.2878 (inf) loss_scale 1024.0000 (1877.9631) mem 22339MB +[2024-07-25 08:25:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][280/625] eta 0:03:21 lr 0.000484 wd 0.0500 time 0.5736 (0.5855) data time 0.0006 (0.0021) model time 0.5730 (0.5859) loss 6.8121 (7.4007) grad_norm 2.5874 (inf) loss_scale 1024.0000 (1847.5730) mem 22339MB +[2024-07-25 08:25:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][290/625] eta 0:03:16 lr 0.000484 wd 0.0500 time 0.5678 (0.5851) data time 0.0006 (0.0020) model time 0.5672 (0.5854) loss 5.9016 (7.4056) grad_norm 3.2075 (inf) loss_scale 512.0000 (1815.7526) mem 22339MB +[2024-07-25 08:25:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][300/625] eta 0:03:10 lr 0.000484 wd 0.0500 time 0.5733 (0.5848) data time 0.0006 (0.0020) model time 0.5727 (0.5849) loss 8.2388 (7.4049) grad_norm 1.7282 (inf) loss_scale 512.0000 (1772.4385) mem 22339MB +[2024-07-25 08:25:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][310/625] eta 0:03:04 lr 0.000484 wd 0.0500 time 0.5738 (0.5845) data time 0.0006 (0.0020) model time 0.5732 (0.5846) loss 7.4358 (7.4113) grad_norm 2.1366 (inf) loss_scale 512.0000 (1731.9100) mem 22339MB +[2024-07-25 08:26:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][320/625] eta 0:02:58 lr 0.000484 wd 0.0500 time 0.5726 (0.5842) data time 0.0006 (0.0019) model time 0.5720 (0.5842) loss 5.7601 (7.3988) grad_norm 2.4322 (inf) loss_scale 512.0000 (1693.9065) mem 22339MB +[2024-07-25 08:26:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][330/625] eta 0:02:52 lr 0.000483 wd 0.0500 time 0.5734 (0.5839) data time 0.0006 (0.0019) model time 0.5728 (0.5839) loss 6.9954 (7.4020) grad_norm 2.7264 (inf) loss_scale 512.0000 (1658.1994) mem 22339MB +[2024-07-25 08:26:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][340/625] eta 0:02:46 lr 0.000483 wd 0.0500 time 0.5696 (0.5837) data time 0.0008 (0.0019) model time 0.5688 (0.5835) loss 8.6036 (7.4126) grad_norm 1.7934 (inf) loss_scale 512.0000 (1624.5865) mem 22339MB +[2024-07-25 08:26:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][350/625] eta 0:02:40 lr 0.000483 wd 0.0500 time 0.5740 (0.5834) data time 0.0008 (0.0018) model time 0.5732 (0.5832) loss 8.2343 (7.4301) grad_norm 2.0005 (inf) loss_scale 512.0000 (1592.8889) mem 22339MB +[2024-07-25 08:26:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][360/625] eta 0:02:34 lr 0.000483 wd 0.0500 time 0.5742 (0.5833) data time 0.0008 (0.0018) model time 0.5734 (0.5830) loss 7.6814 (7.4231) grad_norm 1.8223 (inf) loss_scale 512.0000 (1562.9474) mem 22339MB +[2024-07-25 08:26:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][370/625] eta 0:02:28 lr 0.000483 wd 0.0500 time 0.5728 (0.5835) data time 0.0006 (0.0018) model time 0.5722 (0.5832) loss 6.9953 (7.4317) grad_norm 2.0324 (inf) loss_scale 512.0000 (1534.6199) mem 22339MB +[2024-07-25 08:26:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][380/625] eta 0:02:23 lr 0.000483 wd 0.0500 time 0.6866 (0.5853) data time 0.0008 (0.0018) model time 0.6858 (0.5853) loss 8.3689 (7.4363) grad_norm 3.3802 (inf) loss_scale 512.0000 (1507.7795) mem 22339MB +[2024-07-25 08:26:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][390/625] eta 0:02:17 lr 0.000483 wd 0.0500 time 0.6930 (0.5869) data time 0.0006 (0.0017) model time 0.6923 (0.5871) loss 7.2692 (7.4254) grad_norm 3.1001 (inf) loss_scale 512.0000 (1482.3120) mem 22339MB +[2024-07-25 08:26:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][400/625] eta 0:02:12 lr 0.000483 wd 0.0500 time 0.7527 (0.5885) data time 0.0006 (0.0017) model time 0.7521 (0.5889) loss 7.0489 (7.4198) grad_norm 1.8152 (inf) loss_scale 512.0000 (1458.1147) mem 22339MB +[2024-07-25 08:26:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][410/625] eta 0:02:06 lr 0.000483 wd 0.0500 time 0.5718 (0.5885) data time 0.0008 (0.0017) model time 0.5709 (0.5889) loss 7.9223 (7.4183) grad_norm 3.6381 (inf) loss_scale 512.0000 (1435.0949) mem 22339MB +[2024-07-25 08:27:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][420/625] eta 0:02:00 lr 0.000482 wd 0.0500 time 0.5691 (0.5883) data time 0.0006 (0.0017) model time 0.5685 (0.5886) loss 7.4640 (7.4214) grad_norm 1.9078 (inf) loss_scale 512.0000 (1413.1686) mem 22339MB +[2024-07-25 08:27:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][430/625] eta 0:01:54 lr 0.000482 wd 0.0500 time 0.5732 (0.5879) data time 0.0009 (0.0017) model time 0.5723 (0.5882) loss 8.2905 (7.4303) grad_norm 2.1769 (inf) loss_scale 512.0000 (1392.2599) mem 22339MB +[2024-07-25 08:27:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][440/625] eta 0:01:48 lr 0.000482 wd 0.0500 time 0.5709 (0.5877) data time 0.0008 (0.0016) model time 0.5701 (0.5878) loss 7.3985 (7.4357) grad_norm 2.0910 (inf) loss_scale 512.0000 (1372.2993) mem 22339MB +[2024-07-25 08:27:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][450/625] eta 0:01:42 lr 0.000482 wd 0.0500 time 0.5734 (0.5874) data time 0.0008 (0.0016) model time 0.5726 (0.5875) loss 7.7864 (7.4364) grad_norm 2.5074 (inf) loss_scale 512.0000 (1353.2239) mem 22339MB +[2024-07-25 08:27:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][460/625] eta 0:01:36 lr 0.000482 wd 0.0500 time 0.5718 (0.5871) data time 0.0008 (0.0016) model time 0.5710 (0.5872) loss 7.0565 (7.4346) grad_norm 2.3004 (inf) loss_scale 512.0000 (1334.9761) mem 22339MB +[2024-07-25 08:27:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][470/625] eta 0:01:30 lr 0.000482 wd 0.0500 time 0.5753 (0.5869) data time 0.0006 (0.0016) model time 0.5747 (0.5869) loss 8.5320 (7.4306) grad_norm 2.4293 (inf) loss_scale 512.0000 (1317.5032) mem 22339MB +[2024-07-25 08:27:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][480/625] eta 0:01:25 lr 0.000482 wd 0.0500 time 0.5744 (0.5866) data time 0.0008 (0.0016) model time 0.5736 (0.5866) loss 7.5915 (7.4354) grad_norm 2.4827 (inf) loss_scale 512.0000 (1300.7568) mem 22339MB +[2024-07-25 08:27:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][490/625] eta 0:01:19 lr 0.000482 wd 0.0500 time 0.5709 (0.5864) data time 0.0008 (0.0016) model time 0.5701 (0.5863) loss 7.5994 (7.4398) grad_norm 2.2735 (inf) loss_scale 512.0000 (1284.6925) mem 22339MB +[2024-07-25 08:27:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][500/625] eta 0:01:13 lr 0.000482 wd 0.0500 time 0.5708 (0.5861) data time 0.0008 (0.0016) model time 0.5700 (0.5860) loss 7.4558 (7.4376) grad_norm 2.6293 (inf) loss_scale 512.0000 (1269.2695) mem 22339MB +[2024-07-25 08:27:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][510/625] eta 0:01:07 lr 0.000482 wd 0.0500 time 0.5713 (0.5859) data time 0.0006 (0.0015) model time 0.5707 (0.5857) loss 7.0883 (7.4402) grad_norm 2.1939 (inf) loss_scale 512.0000 (1254.4501) mem 22339MB +[2024-07-25 08:27:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][520/625] eta 0:01:01 lr 0.000481 wd 0.0500 time 0.5696 (0.5857) data time 0.0008 (0.0015) model time 0.5688 (0.5854) loss 9.3875 (7.4516) grad_norm 2.2237 (inf) loss_scale 512.0000 (1240.1996) mem 22339MB +[2024-07-25 08:28:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][530/625] eta 0:00:55 lr 0.000481 wd 0.0500 time 0.5719 (0.5854) data time 0.0006 (0.0015) model time 0.5712 (0.5852) loss 7.1400 (7.4528) grad_norm 2.0567 (inf) loss_scale 512.0000 (1226.4859) mem 22339MB +[2024-07-25 08:28:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][540/625] eta 0:00:49 lr 0.000481 wd 0.0500 time 0.5720 (0.5852) data time 0.0007 (0.0015) model time 0.5713 (0.5849) loss 6.1484 (7.4418) grad_norm 2.1285 (inf) loss_scale 512.0000 (1213.2791) mem 22339MB +[2024-07-25 08:28:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][550/625] eta 0:00:43 lr 0.000481 wd 0.0500 time 0.5744 (0.5850) data time 0.0006 (0.0015) model time 0.5738 (0.5847) loss 6.9858 (7.4358) grad_norm 3.7325 (inf) loss_scale 512.0000 (1200.5517) mem 22339MB +[2024-07-25 08:28:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][560/625] eta 0:00:38 lr 0.000481 wd 0.0500 time 0.5637 (0.5848) data time 0.0007 (0.0015) model time 0.5630 (0.5845) loss 8.1193 (7.4388) grad_norm 1.7017 (inf) loss_scale 512.0000 (1188.2781) mem 22339MB +[2024-07-25 08:28:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][570/625] eta 0:00:32 lr 0.000481 wd 0.0500 time 0.5620 (0.5846) data time 0.0008 (0.0015) model time 0.5612 (0.5843) loss 8.1478 (7.4398) grad_norm 1.9492 (inf) loss_scale 512.0000 (1176.4343) mem 22339MB +[2024-07-25 08:28:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][580/625] eta 0:00:26 lr 0.000481 wd 0.0500 time 0.5724 (0.5847) data time 0.0006 (0.0015) model time 0.5718 (0.5843) loss 8.1162 (7.4326) grad_norm 2.5705 (inf) loss_scale 512.0000 (1164.9983) mem 22339MB +[2024-07-25 08:28:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][590/625] eta 0:00:20 lr 0.000481 wd 0.0500 time 0.5741 (0.5847) data time 0.0008 (0.0015) model time 0.5733 (0.5843) loss 8.1563 (7.4372) grad_norm 1.7405 (inf) loss_scale 512.0000 (1153.9492) mem 22339MB +[2024-07-25 08:28:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][600/625] eta 0:00:14 lr 0.000481 wd 0.0500 time 0.7135 (0.5857) data time 0.0006 (0.0014) model time 0.7129 (0.5853) loss 7.8714 (7.4409) grad_norm 1.9480 (inf) loss_scale 512.0000 (1143.2679) mem 22339MB +[2024-07-25 08:28:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][610/625] eta 0:00:08 lr 0.000480 wd 0.0500 time 0.7022 (0.5863) data time 0.0006 (0.0014) model time 0.7016 (0.5860) loss 7.6783 (7.4426) grad_norm 2.8547 (inf) loss_scale 512.0000 (1132.9362) mem 22339MB +[2024-07-25 08:28:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [178/300][620/625] eta 0:00:02 lr 0.000480 wd 0.0500 time 0.7657 (0.5867) data time 0.0006 (0.0014) model time 0.7651 (0.5865) loss 7.7476 (7.4456) grad_norm 8.5679 (inf) loss_scale 512.0000 (1122.9372) mem 22339MB +[2024-07-25 08:29:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 178 training takes 0:06:06 +[2024-07-25 08:29:01 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 08:29:02 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 08:29:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.480 (0.480) Loss 0.5103 (0.5103) Acc@1 89.697 (89.697) Acc@5 98.828 (98.828) Mem 22339MB +[2024-07-25 08:29:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.7642 (0.6326) Acc@1 82.275 (86.697) Acc@5 96.289 (97.852) Mem 22339MB +[2024-07-25 08:29:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.8823 (0.7341) Acc@1 78.613 (83.570) Acc@5 95.654 (96.828) Mem 22339MB +[2024-07-25 08:29:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.171 Acc@5 96.793 +[2024-07-25 08:29:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.2% +[2024-07-25 08:29:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.861 (0.861) Loss 0.4988 (0.4988) Acc@1 89.990 (89.990) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-25 08:29:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.192) Loss 0.7559 (0.6222) Acc@1 82.617 (86.998) Acc@5 96.582 (97.909) Mem 22339MB +[2024-07-25 08:29:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.161) Loss 0.8804 (0.7191) Acc@1 78.613 (84.005) Acc@5 95.801 (96.952) Mem 22339MB +[2024-07-25 08:29:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.613 Acc@5 96.945 +[2024-07-25 08:29:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.6% +[2024-07-25 08:29:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.61% +[2024-07-25 08:29:10 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 08:29:11 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 08:29:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][0/625] eta 0:09:06 lr 0.000480 wd 0.0500 time 0.8742 (0.8742) data time 0.3570 (0.3570) model time 0.0000 (0.0000) loss 7.8170 (7.8170) grad_norm 2.4861 (2.4861) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:29:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][10/625] eta 0:06:18 lr 0.000480 wd 0.0500 time 0.6227 (0.6149) data time 0.0006 (0.0332) model time 0.0000 (0.0000) loss 6.3576 (7.2127) grad_norm 2.5606 (2.6755) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:29:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][20/625] eta 0:05:59 lr 0.000480 wd 0.0500 time 0.5663 (0.5948) data time 0.0009 (0.0178) model time 0.0000 (0.0000) loss 7.5087 (7.4019) grad_norm 1.9394 (2.4547) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:29:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][30/625] eta 0:05:49 lr 0.000480 wd 0.0500 time 0.5746 (0.5875) data time 0.0008 (0.0123) model time 0.0000 (0.0000) loss 5.6916 (7.2469) grad_norm 1.6791 (2.4002) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:29:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][40/625] eta 0:05:41 lr 0.000480 wd 0.0500 time 0.5694 (0.5839) data time 0.0008 (0.0095) model time 0.0000 (0.0000) loss 8.9979 (7.2640) grad_norm 2.5550 (2.3439) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:29:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][50/625] eta 0:05:34 lr 0.000480 wd 0.0500 time 0.5710 (0.5818) data time 0.0006 (0.0078) model time 0.0000 (0.0000) loss 6.8772 (7.2411) grad_norm 2.2866 (2.3237) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:29:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][60/625] eta 0:05:28 lr 0.000480 wd 0.0500 time 0.5738 (0.5808) data time 0.0008 (0.0067) model time 0.5729 (0.5746) loss 7.7292 (7.2879) grad_norm 2.7639 (2.3482) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:29:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][70/625] eta 0:05:21 lr 0.000480 wd 0.0500 time 0.5708 (0.5798) data time 0.0007 (0.0059) model time 0.5702 (0.5737) loss 8.3582 (7.3303) grad_norm 2.6223 (2.3239) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:29:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][80/625] eta 0:05:15 lr 0.000479 wd 0.0500 time 0.5703 (0.5790) data time 0.0008 (0.0053) model time 0.5695 (0.5733) loss 7.5517 (7.3317) grad_norm 2.9254 (2.3532) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:30:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][90/625] eta 0:05:09 lr 0.000479 wd 0.0500 time 0.5687 (0.5783) data time 0.0009 (0.0048) model time 0.5679 (0.5731) loss 7.5562 (7.2992) grad_norm 3.3731 (2.3973) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:30:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][100/625] eta 0:05:03 lr 0.000479 wd 0.0500 time 0.5746 (0.5778) data time 0.0006 (0.0044) model time 0.5741 (0.5730) loss 8.8135 (7.3150) grad_norm 3.3549 (2.4380) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:30:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][110/625] eta 0:04:57 lr 0.000479 wd 0.0500 time 0.5679 (0.5774) data time 0.0007 (0.0041) model time 0.5672 (0.5729) loss 8.7346 (7.3057) grad_norm 3.4404 (2.4758) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:30:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][120/625] eta 0:04:51 lr 0.000479 wd 0.0500 time 0.5738 (0.5772) data time 0.0006 (0.0038) model time 0.5732 (0.5731) loss 7.2308 (7.3094) grad_norm 2.3705 (2.4937) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:30:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][130/625] eta 0:04:45 lr 0.000479 wd 0.0500 time 0.5786 (0.5770) data time 0.0007 (0.0035) model time 0.5779 (0.5732) loss 8.1294 (7.3369) grad_norm 2.4208 (2.5101) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:30:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][140/625] eta 0:04:39 lr 0.000479 wd 0.0500 time 0.5738 (0.5768) data time 0.0006 (0.0033) model time 0.5732 (0.5732) loss 7.7453 (7.3412) grad_norm 2.8819 (2.5168) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:30:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][150/625] eta 0:04:33 lr 0.000479 wd 0.0500 time 0.5742 (0.5767) data time 0.0008 (0.0032) model time 0.5734 (0.5733) loss 8.9931 (7.3544) grad_norm 2.3816 (2.5067) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:30:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][160/625] eta 0:04:28 lr 0.000479 wd 0.0500 time 0.5732 (0.5775) data time 0.0007 (0.0030) model time 0.5725 (0.5747) loss 6.8060 (7.3709) grad_norm 2.9920 (2.5141) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:30:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][170/625] eta 0:04:22 lr 0.000479 wd 0.0500 time 0.5721 (0.5774) data time 0.0006 (0.0029) model time 0.5716 (0.5747) loss 6.0986 (7.3864) grad_norm 2.6252 (2.5146) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:30:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][180/625] eta 0:04:17 lr 0.000478 wd 0.0500 time 0.5715 (0.5786) data time 0.0006 (0.0028) model time 0.5709 (0.5766) loss 8.9674 (7.3840) grad_norm 2.0091 (2.5075) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:31:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][190/625] eta 0:04:12 lr 0.000478 wd 0.0500 time 0.5732 (0.5795) data time 0.0006 (0.0027) model time 0.5726 (0.5779) loss 7.4818 (7.3821) grad_norm 1.9115 (2.4932) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:31:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][200/625] eta 0:04:07 lr 0.000478 wd 0.0500 time 0.5675 (0.5823) data time 0.0006 (0.0026) model time 0.5669 (0.5816) loss 6.6857 (7.3659) grad_norm 1.6727 (2.4979) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:31:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][210/625] eta 0:04:02 lr 0.000478 wd 0.0500 time 0.5734 (0.5840) data time 0.0008 (0.0025) model time 0.5726 (0.5838) loss 6.7976 (7.3476) grad_norm 2.1478 (2.4932) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:31:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][220/625] eta 0:03:57 lr 0.000478 wd 0.0500 time 0.5726 (0.5862) data time 0.0006 (0.0024) model time 0.5721 (0.5867) loss 6.1058 (7.3386) grad_norm 2.4230 (2.4810) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:31:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][230/625] eta 0:03:51 lr 0.000478 wd 0.0500 time 0.5696 (0.5857) data time 0.0006 (0.0024) model time 0.5690 (0.5860) loss 8.5974 (7.3444) grad_norm 1.5606 (2.4686) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:31:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][240/625] eta 0:03:45 lr 0.000478 wd 0.0500 time 0.5702 (0.5854) data time 0.0006 (0.0023) model time 0.5696 (0.5855) loss 7.4164 (7.3647) grad_norm 2.1845 (2.4664) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:31:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][250/625] eta 0:03:39 lr 0.000478 wd 0.0500 time 0.5746 (0.5849) data time 0.0006 (0.0022) model time 0.5740 (0.5848) loss 6.2614 (7.3604) grad_norm 1.6283 (2.4503) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:31:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][260/625] eta 0:03:33 lr 0.000478 wd 0.0500 time 0.5621 (0.5845) data time 0.0008 (0.0022) model time 0.5613 (0.5843) loss 7.0743 (7.3528) grad_norm 2.3857 (2.4497) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:31:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][270/625] eta 0:03:27 lr 0.000478 wd 0.0500 time 0.5759 (0.5842) data time 0.0006 (0.0022) model time 0.5752 (0.5839) loss 8.2092 (7.3565) grad_norm 1.8054 (2.4351) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:31:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][280/625] eta 0:03:21 lr 0.000477 wd 0.0500 time 0.5694 (0.5838) data time 0.0009 (0.0021) model time 0.5685 (0.5835) loss 8.3707 (7.3462) grad_norm 3.4308 (2.4319) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:32:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][290/625] eta 0:03:15 lr 0.000477 wd 0.0500 time 0.5749 (0.5835) data time 0.0008 (0.0021) model time 0.5741 (0.5831) loss 8.6940 (7.3626) grad_norm 1.9962 (2.4360) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:32:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][300/625] eta 0:03:09 lr 0.000477 wd 0.0500 time 0.5692 (0.5832) data time 0.0008 (0.0020) model time 0.5684 (0.5827) loss 8.3721 (7.3601) grad_norm 2.5959 (2.4282) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:32:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][310/625] eta 0:03:03 lr 0.000477 wd 0.0500 time 0.5742 (0.5830) data time 0.0008 (0.0020) model time 0.5733 (0.5823) loss 5.9242 (7.3742) grad_norm 2.2456 (2.4378) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:32:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][320/625] eta 0:02:57 lr 0.000477 wd 0.0500 time 0.5719 (0.5827) data time 0.0008 (0.0020) model time 0.5711 (0.5820) loss 8.7732 (7.3718) grad_norm 3.5250 (2.4308) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:32:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][330/625] eta 0:02:51 lr 0.000477 wd 0.0500 time 0.5695 (0.5824) data time 0.0007 (0.0019) model time 0.5688 (0.5817) loss 8.1413 (7.3766) grad_norm 2.0775 (2.4252) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:32:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][340/625] eta 0:02:45 lr 0.000477 wd 0.0500 time 0.5723 (0.5822) data time 0.0006 (0.0019) model time 0.5717 (0.5814) loss 7.2736 (7.3701) grad_norm 1.6677 (2.4210) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:32:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][350/625] eta 0:02:40 lr 0.000477 wd 0.0500 time 0.5731 (0.5820) data time 0.0007 (0.0019) model time 0.5725 (0.5811) loss 8.1328 (7.3736) grad_norm 5.9251 (2.4253) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:32:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][360/625] eta 0:02:34 lr 0.000477 wd 0.0500 time 0.5721 (0.5818) data time 0.0007 (0.0018) model time 0.5715 (0.5809) loss 6.8287 (7.3688) grad_norm 17.6632 (2.4623) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:32:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][370/625] eta 0:02:28 lr 0.000476 wd 0.0500 time 0.5747 (0.5816) data time 0.0006 (0.0018) model time 0.5740 (0.5807) loss 6.9650 (7.3716) grad_norm 4.5977 (2.4695) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:32:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][380/625] eta 0:02:22 lr 0.000476 wd 0.0500 time 0.5707 (0.5816) data time 0.0007 (0.0018) model time 0.5701 (0.5807) loss 7.1233 (7.3738) grad_norm 1.6773 (2.4615) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:32:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][390/625] eta 0:02:16 lr 0.000476 wd 0.0500 time 0.6349 (0.5816) data time 0.0006 (0.0018) model time 0.6343 (0.5807) loss 7.2872 (7.3778) grad_norm 4.1511 (2.4702) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:33:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][400/625] eta 0:02:10 lr 0.000476 wd 0.0500 time 0.5735 (0.5818) data time 0.0006 (0.0017) model time 0.5728 (0.5810) loss 6.1851 (7.3807) grad_norm 3.0812 (2.4694) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:33:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][410/625] eta 0:02:05 lr 0.000476 wd 0.0500 time 0.7405 (0.5826) data time 0.0007 (0.0017) model time 0.7398 (0.5818) loss 6.2047 (7.3789) grad_norm 1.6259 (2.4648) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:33:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][420/625] eta 0:01:59 lr 0.000476 wd 0.0500 time 0.5685 (0.5839) data time 0.0009 (0.0017) model time 0.5676 (0.5833) loss 7.6285 (7.3773) grad_norm 2.6549 (2.4615) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:33:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][430/625] eta 0:01:54 lr 0.000476 wd 0.0500 time 0.5732 (0.5850) data time 0.0008 (0.0017) model time 0.5724 (0.5845) loss 8.4669 (7.3799) grad_norm 2.7452 (2.4777) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:33:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][440/625] eta 0:01:48 lr 0.000476 wd 0.0500 time 0.6694 (0.5855) data time 0.0006 (0.0017) model time 0.6687 (0.5852) loss 7.9207 (7.3833) grad_norm 1.9067 (2.4697) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:33:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][450/625] eta 0:01:42 lr 0.000476 wd 0.0500 time 0.5703 (0.5853) data time 0.0008 (0.0017) model time 0.5695 (0.5849) loss 7.1746 (7.3788) grad_norm 2.0086 (2.4653) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:33:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][460/625] eta 0:01:36 lr 0.000476 wd 0.0500 time 0.5736 (0.5851) data time 0.0006 (0.0016) model time 0.5730 (0.5846) loss 6.6114 (7.3696) grad_norm 4.3164 (2.4731) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:33:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][470/625] eta 0:01:30 lr 0.000475 wd 0.0500 time 0.5738 (0.5848) data time 0.0008 (0.0016) model time 0.5729 (0.5843) loss 7.8193 (7.3655) grad_norm 3.5457 (2.4762) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:33:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][480/625] eta 0:01:24 lr 0.000475 wd 0.0500 time 0.5736 (0.5846) data time 0.0008 (0.0016) model time 0.5728 (0.5841) loss 7.6865 (7.3688) grad_norm 3.1965 (2.4719) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:33:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][490/625] eta 0:01:18 lr 0.000475 wd 0.0500 time 0.5709 (0.5844) data time 0.0006 (0.0016) model time 0.5703 (0.5838) loss 7.0394 (7.3689) grad_norm 2.7069 (2.4725) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:34:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][500/625] eta 0:01:13 lr 0.000475 wd 0.0500 time 0.5711 (0.5842) data time 0.0007 (0.0016) model time 0.5704 (0.5836) loss 6.0112 (7.3581) grad_norm 1.9852 (2.4690) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:34:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][510/625] eta 0:01:07 lr 0.000475 wd 0.0500 time 0.5697 (0.5840) data time 0.0007 (0.0016) model time 0.5690 (0.5834) loss 8.0717 (7.3597) grad_norm 1.7108 (2.4664) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:34:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][520/625] eta 0:01:01 lr 0.000475 wd 0.0500 time 0.5734 (0.5838) data time 0.0006 (0.0015) model time 0.5728 (0.5832) loss 6.8135 (7.3592) grad_norm 2.3633 (2.4614) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:34:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][530/625] eta 0:00:55 lr 0.000475 wd 0.0500 time 0.5726 (0.5836) data time 0.0006 (0.0015) model time 0.5720 (0.5829) loss 6.1847 (7.3545) grad_norm 2.2098 (2.4567) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:34:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][540/625] eta 0:00:49 lr 0.000475 wd 0.0500 time 0.5715 (0.5834) data time 0.0007 (0.0015) model time 0.5708 (0.5828) loss 8.4592 (7.3513) grad_norm 3.5307 (2.4552) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:34:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][550/625] eta 0:00:43 lr 0.000475 wd 0.0500 time 0.5748 (0.5833) data time 0.0007 (0.0015) model time 0.5741 (0.5826) loss 7.5867 (7.3522) grad_norm 1.5919 (2.4497) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:34:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][560/625] eta 0:00:37 lr 0.000474 wd 0.0500 time 0.5738 (0.5831) data time 0.0008 (0.0015) model time 0.5730 (0.5824) loss 8.4931 (7.3564) grad_norm 1.9214 (2.4431) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:34:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][570/625] eta 0:00:32 lr 0.000474 wd 0.0500 time 0.5730 (0.5830) data time 0.0006 (0.0015) model time 0.5724 (0.5822) loss 7.0775 (7.3646) grad_norm 1.5199 (2.4351) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:34:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][580/625] eta 0:00:26 lr 0.000474 wd 0.0500 time 0.5670 (0.5828) data time 0.0006 (0.0015) model time 0.5664 (0.5821) loss 6.8548 (7.3610) grad_norm 2.6386 (2.4445) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:34:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][590/625] eta 0:00:20 lr 0.000474 wd 0.0500 time 0.5742 (0.5827) data time 0.0007 (0.0015) model time 0.5735 (0.5819) loss 8.5007 (7.3673) grad_norm 1.9996 (2.4427) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:35:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][600/625] eta 0:00:14 lr 0.000474 wd 0.0500 time 0.5682 (0.5827) data time 0.0008 (0.0014) model time 0.5674 (0.5820) loss 8.3860 (7.3649) grad_norm 3.1707 (2.4401) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:35:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][610/625] eta 0:00:08 lr 0.000474 wd 0.0500 time 0.6722 (0.5828) data time 0.0006 (0.0014) model time 0.6716 (0.5820) loss 7.3265 (7.3622) grad_norm 2.5283 (2.4420) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:35:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [179/300][620/625] eta 0:00:02 lr 0.000474 wd 0.0500 time 0.5672 (0.5828) data time 0.0004 (0.0014) model time 0.5669 (0.5820) loss 8.6153 (7.3711) grad_norm 1.9971 (2.4398) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:35:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 179 training takes 0:06:04 +[2024-07-25 08:35:15 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 08:35:17 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 08:35:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.478 (0.478) Loss 0.5098 (0.5098) Acc@1 89.746 (89.746) Acc@5 98.730 (98.730) Mem 22339MB +[2024-07-25 08:35:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.7983 (0.6326) Acc@1 81.396 (86.643) Acc@5 96.289 (97.825) Mem 22339MB +[2024-07-25 08:35:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9248 (0.7375) Acc@1 77.686 (83.480) Acc@5 95.459 (96.798) Mem 22339MB +[2024-07-25 08:35:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.207 Acc@5 96.783 +[2024-07-25 08:35:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.2% +[2024-07-25 08:35:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.827 (0.827) Loss 0.4985 (0.4985) Acc@1 89.990 (89.990) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-25 08:35:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.190) Loss 0.7559 (0.6221) Acc@1 82.861 (87.025) Acc@5 96.582 (97.914) Mem 22339MB +[2024-07-25 08:35:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.159) Loss 0.8799 (0.7187) Acc@1 78.564 (84.029) Acc@5 95.850 (96.947) Mem 22339MB +[2024-07-25 08:35:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.629 Acc@5 96.941 +[2024-07-25 08:35:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.6% +[2024-07-25 08:35:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.63% +[2024-07-25 08:35:24 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 08:35:26 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 08:35:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][0/625] eta 0:09:29 lr 0.000474 wd 0.0500 time 0.9117 (0.9117) data time 0.3907 (0.3907) model time 0.0000 (0.0000) loss 7.9021 (7.9021) grad_norm 1.7272 (1.7272) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:35:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][10/625] eta 0:07:06 lr 0.000474 wd 0.0500 time 0.7302 (0.6927) data time 0.0008 (0.0364) model time 0.0000 (0.0000) loss 6.2710 (7.1358) grad_norm 2.6552 (2.1951) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:35:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][20/625] eta 0:06:47 lr 0.000474 wd 0.0500 time 0.7028 (0.6736) data time 0.0008 (0.0195) model time 0.0000 (0.0000) loss 6.8101 (7.3743) grad_norm 1.9209 (2.1678) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:35:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][30/625] eta 0:06:30 lr 0.000474 wd 0.0500 time 0.7054 (0.6567) data time 0.0007 (0.0135) model time 0.0000 (0.0000) loss 6.8620 (7.4179) grad_norm 4.3180 (2.2939) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:35:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][40/625] eta 0:06:15 lr 0.000473 wd 0.0500 time 0.5702 (0.6414) data time 0.0008 (0.0104) model time 0.0000 (0.0000) loss 6.8700 (7.4042) grad_norm 2.3655 (2.2540) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:35:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][50/625] eta 0:06:02 lr 0.000473 wd 0.0500 time 0.7240 (0.6310) data time 0.0008 (0.0086) model time 0.0000 (0.0000) loss 6.6611 (7.3658) grad_norm 2.4995 (2.3194) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:36:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][60/625] eta 0:05:51 lr 0.000473 wd 0.0500 time 0.5748 (0.6215) data time 0.0008 (0.0073) model time 0.5740 (0.5720) loss 9.0302 (7.3422) grad_norm 4.1140 (2.3646) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:36:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][70/625] eta 0:05:41 lr 0.000473 wd 0.0500 time 0.5725 (0.6148) data time 0.0007 (0.0064) model time 0.5717 (0.5726) loss 7.4751 (7.4211) grad_norm 2.2981 (2.3973) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:36:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][80/625] eta 0:05:32 lr 0.000473 wd 0.0500 time 0.5716 (0.6098) data time 0.0008 (0.0057) model time 0.5708 (0.5730) loss 8.0664 (7.4093) grad_norm 3.1723 (2.3771) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:36:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][90/625] eta 0:05:24 lr 0.000473 wd 0.0500 time 0.5695 (0.6058) data time 0.0008 (0.0052) model time 0.5687 (0.5729) loss 6.5699 (7.4194) grad_norm 3.6590 (2.4503) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:36:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][100/625] eta 0:05:16 lr 0.000473 wd 0.0500 time 0.5699 (0.6026) data time 0.0006 (0.0047) model time 0.5693 (0.5728) loss 6.9370 (7.3674) grad_norm 4.7191 (2.4788) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:36:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][110/625] eta 0:05:09 lr 0.000473 wd 0.0500 time 0.5742 (0.6002) data time 0.0008 (0.0044) model time 0.5735 (0.5732) loss 7.9820 (7.3822) grad_norm 2.1648 (2.5155) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:36:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][120/625] eta 0:05:02 lr 0.000473 wd 0.0500 time 0.5719 (0.5980) data time 0.0008 (0.0041) model time 0.5711 (0.5732) loss 6.3949 (7.3707) grad_norm 2.6234 (2.5013) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:36:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][130/625] eta 0:04:55 lr 0.000472 wd 0.0500 time 0.5708 (0.5973) data time 0.0006 (0.0038) model time 0.5702 (0.5749) loss 8.3393 (7.4027) grad_norm 2.3998 (2.5175) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:36:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][140/625] eta 0:04:48 lr 0.000472 wd 0.0500 time 0.5751 (0.5956) data time 0.0009 (0.0036) model time 0.5742 (0.5748) loss 7.4466 (7.4023) grad_norm 2.1382 (2.4793) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:36:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][150/625] eta 0:04:42 lr 0.000472 wd 0.0500 time 0.5727 (0.5942) data time 0.0007 (0.0034) model time 0.5721 (0.5746) loss 6.5235 (7.4014) grad_norm 2.3020 (2.4619) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:37:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][160/625] eta 0:04:35 lr 0.000472 wd 0.0500 time 0.5738 (0.5930) data time 0.0009 (0.0033) model time 0.5729 (0.5746) loss 6.6642 (7.4095) grad_norm 3.0513 (2.4460) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:37:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][170/625] eta 0:04:29 lr 0.000472 wd 0.0500 time 0.5747 (0.5920) data time 0.0007 (0.0031) model time 0.5740 (0.5746) loss 7.9223 (7.4210) grad_norm 2.6101 (2.4818) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:37:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][180/625] eta 0:04:22 lr 0.000472 wd 0.0500 time 0.5823 (0.5910) data time 0.0008 (0.0030) model time 0.5815 (0.5745) loss 6.7083 (7.4239) grad_norm 1.8368 (2.5059) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:37:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][190/625] eta 0:04:16 lr 0.000472 wd 0.0500 time 0.5757 (0.5901) data time 0.0008 (0.0029) model time 0.5749 (0.5744) loss 7.1932 (7.4019) grad_norm 3.1270 (2.5094) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:37:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][200/625] eta 0:04:10 lr 0.000472 wd 0.0500 time 0.5757 (0.5894) data time 0.0006 (0.0028) model time 0.5751 (0.5744) loss 7.3522 (7.4054) grad_norm 3.0703 (2.5152) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:37:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][210/625] eta 0:04:04 lr 0.000472 wd 0.0500 time 0.5721 (0.5894) data time 0.0006 (0.0027) model time 0.5716 (0.5752) loss 6.1066 (7.3833) grad_norm 2.8586 (2.5341) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:37:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][220/625] eta 0:03:58 lr 0.000472 wd 0.0500 time 0.5730 (0.5894) data time 0.0008 (0.0026) model time 0.5722 (0.5761) loss 7.5520 (7.3877) grad_norm 1.9785 (2.5444) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:37:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][230/625] eta 0:03:53 lr 0.000471 wd 0.0500 time 0.7371 (0.5920) data time 0.0007 (0.0025) model time 0.7363 (0.5802) loss 8.0928 (7.3891) grad_norm 1.7796 (2.5406) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:37:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][240/625] eta 0:03:48 lr 0.000471 wd 0.0500 time 0.5698 (0.5939) data time 0.0008 (0.0025) model time 0.5690 (0.5831) loss 6.8270 (7.3877) grad_norm 2.2497 (2.5286) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:37:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][250/625] eta 0:03:43 lr 0.000471 wd 0.0500 time 0.7130 (0.5956) data time 0.0005 (0.0024) model time 0.7124 (0.5858) loss 6.3758 (7.3745) grad_norm 4.6306 (2.5325) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:38:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][260/625] eta 0:03:37 lr 0.000471 wd 0.0500 time 0.5714 (0.5958) data time 0.0006 (0.0023) model time 0.5708 (0.5864) loss 7.1526 (7.3781) grad_norm 1.9896 (2.5572) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:38:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][270/625] eta 0:03:31 lr 0.000471 wd 0.0500 time 0.5672 (0.5952) data time 0.0007 (0.0023) model time 0.5665 (0.5860) loss 8.1294 (7.3884) grad_norm 4.2342 (2.5707) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:38:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][280/625] eta 0:03:25 lr 0.000471 wd 0.0500 time 0.5688 (0.5944) data time 0.0006 (0.0022) model time 0.5682 (0.5854) loss 7.5302 (7.3792) grad_norm 2.4963 (2.5717) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:38:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][290/625] eta 0:03:18 lr 0.000471 wd 0.0500 time 0.5733 (0.5938) data time 0.0006 (0.0022) model time 0.5727 (0.5850) loss 8.7872 (7.3831) grad_norm 2.2116 (2.5578) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:38:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][300/625] eta 0:03:12 lr 0.000471 wd 0.0500 time 0.5699 (0.5932) data time 0.0006 (0.0022) model time 0.5693 (0.5847) loss 6.2962 (7.3890) grad_norm 2.1995 (2.5486) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:38:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][310/625] eta 0:03:06 lr 0.000471 wd 0.0500 time 0.5728 (0.5926) data time 0.0006 (0.0021) model time 0.5722 (0.5843) loss 9.1012 (7.3908) grad_norm 1.8784 (2.5363) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:38:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][320/625] eta 0:03:00 lr 0.000470 wd 0.0500 time 0.5704 (0.5920) data time 0.0007 (0.0021) model time 0.5697 (0.5838) loss 8.2302 (7.3848) grad_norm 1.5954 (2.5192) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:38:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][330/625] eta 0:02:54 lr 0.000470 wd 0.0500 time 0.5733 (0.5915) data time 0.0008 (0.0020) model time 0.5725 (0.5834) loss 9.0247 (7.3829) grad_norm 2.7991 (2.5093) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:38:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][340/625] eta 0:02:48 lr 0.000470 wd 0.0500 time 0.5719 (0.5910) data time 0.0006 (0.0020) model time 0.5713 (0.5831) loss 7.8928 (7.3826) grad_norm 2.1728 (2.4999) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:38:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][350/625] eta 0:02:42 lr 0.000470 wd 0.0500 time 0.5756 (0.5910) data time 0.0008 (0.0020) model time 0.5748 (0.5833) loss 7.1527 (7.3862) grad_norm 1.8414 (2.4907) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:38:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][360/625] eta 0:02:36 lr 0.000470 wd 0.0500 time 0.5733 (0.5905) data time 0.0006 (0.0019) model time 0.5727 (0.5830) loss 6.7429 (7.3797) grad_norm 3.7841 (2.4938) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:39:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][370/625] eta 0:02:30 lr 0.000470 wd 0.0500 time 0.5745 (0.5901) data time 0.0008 (0.0019) model time 0.5738 (0.5827) loss 7.0771 (7.3872) grad_norm 2.0857 (2.4877) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:39:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][380/625] eta 0:02:24 lr 0.000470 wd 0.0500 time 0.5720 (0.5897) data time 0.0006 (0.0019) model time 0.5714 (0.5825) loss 6.2149 (7.3800) grad_norm 2.0119 (2.4800) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:39:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][390/625] eta 0:02:18 lr 0.000470 wd 0.0500 time 0.5752 (0.5893) data time 0.0008 (0.0019) model time 0.5744 (0.5822) loss 8.4421 (7.3905) grad_norm 1.9602 (2.5056) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:39:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][400/625] eta 0:02:12 lr 0.000470 wd 0.0500 time 0.5734 (0.5890) data time 0.0008 (0.0018) model time 0.5726 (0.5820) loss 7.9573 (7.3904) grad_norm 2.0399 (2.5035) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:39:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][410/625] eta 0:02:06 lr 0.000470 wd 0.0500 time 0.5727 (0.5887) data time 0.0006 (0.0018) model time 0.5720 (0.5818) loss 7.0908 (7.3896) grad_norm 2.1327 (2.4904) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:39:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][420/625] eta 0:02:00 lr 0.000469 wd 0.0500 time 0.5713 (0.5883) data time 0.0008 (0.0018) model time 0.5705 (0.5815) loss 7.9332 (7.3884) grad_norm 3.3610 (2.4937) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:39:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][430/625] eta 0:01:54 lr 0.000469 wd 0.0500 time 0.6129 (0.5883) data time 0.0006 (0.0018) model time 0.6122 (0.5817) loss 7.9552 (7.4003) grad_norm 2.3310 (2.4954) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:39:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][440/625] eta 0:01:48 lr 0.000469 wd 0.0500 time 0.5734 (0.5880) data time 0.0008 (0.0017) model time 0.5726 (0.5816) loss 7.5277 (7.4012) grad_norm 2.0138 (2.4870) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:39:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][450/625] eta 0:01:43 lr 0.000469 wd 0.0500 time 0.7696 (0.5887) data time 0.0006 (0.0017) model time 0.7690 (0.5824) loss 7.0360 (7.4024) grad_norm 2.0276 (2.4741) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:39:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][460/625] eta 0:01:37 lr 0.000469 wd 0.0500 time 0.5677 (0.5893) data time 0.0009 (0.0017) model time 0.5669 (0.5833) loss 7.3870 (7.3951) grad_norm 1.7795 (2.4657) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:40:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][470/625] eta 0:01:31 lr 0.000469 wd 0.0500 time 0.7075 (0.5903) data time 0.0008 (0.0017) model time 0.7067 (0.5845) loss 6.1472 (7.3935) grad_norm 2.2733 (2.4623) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:40:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][480/625] eta 0:01:25 lr 0.000469 wd 0.0500 time 0.5726 (0.5906) data time 0.0008 (0.0017) model time 0.5718 (0.5850) loss 7.5953 (7.3935) grad_norm 2.5847 (2.4580) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:40:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][490/625] eta 0:01:19 lr 0.000469 wd 0.0500 time 0.5707 (0.5905) data time 0.0008 (0.0017) model time 0.5699 (0.5850) loss 6.1212 (7.3877) grad_norm 2.2401 (2.4584) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:40:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][500/625] eta 0:01:13 lr 0.000469 wd 0.0500 time 0.5722 (0.5902) data time 0.0008 (0.0016) model time 0.5714 (0.5847) loss 8.8808 (7.3879) grad_norm 2.4120 (2.4579) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:40:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][510/625] eta 0:01:07 lr 0.000469 wd 0.0500 time 0.5736 (0.5899) data time 0.0007 (0.0016) model time 0.5729 (0.5845) loss 8.1476 (7.3870) grad_norm 2.1671 (2.4516) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:40:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][520/625] eta 0:01:01 lr 0.000468 wd 0.0500 time 0.5726 (0.5897) data time 0.0007 (0.0016) model time 0.5719 (0.5843) loss 7.6220 (7.3822) grad_norm 2.6428 (2.4559) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:40:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][530/625] eta 0:00:55 lr 0.000468 wd 0.0500 time 0.5772 (0.5894) data time 0.0008 (0.0016) model time 0.5765 (0.5841) loss 8.1178 (7.3855) grad_norm 2.6355 (2.4576) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:40:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][540/625] eta 0:00:50 lr 0.000468 wd 0.0500 time 0.5747 (0.5891) data time 0.0008 (0.0016) model time 0.5738 (0.5839) loss 8.3659 (7.3981) grad_norm 3.4279 (2.4651) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:40:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][550/625] eta 0:00:44 lr 0.000468 wd 0.0500 time 0.5750 (0.5889) data time 0.0007 (0.0016) model time 0.5743 (0.5837) loss 8.2354 (7.4026) grad_norm 1.8254 (2.4694) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:40:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][560/625] eta 0:00:38 lr 0.000468 wd 0.0500 time 0.5638 (0.5886) data time 0.0006 (0.0016) model time 0.5632 (0.5835) loss 7.9990 (7.3994) grad_norm 2.8951 (2.4698) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:41:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][570/625] eta 0:00:32 lr 0.000468 wd 0.0500 time 0.5733 (0.5885) data time 0.0006 (0.0015) model time 0.5727 (0.5834) loss 6.8148 (7.3962) grad_norm 2.0271 (2.4613) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:41:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][580/625] eta 0:00:26 lr 0.000468 wd 0.0500 time 0.5740 (0.5882) data time 0.0006 (0.0015) model time 0.5734 (0.5832) loss 7.6842 (7.3942) grad_norm 3.1416 (2.4534) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:41:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][590/625] eta 0:00:20 lr 0.000468 wd 0.0500 time 0.5744 (0.5880) data time 0.0006 (0.0015) model time 0.5738 (0.5831) loss 5.8837 (7.3996) grad_norm 1.8722 (2.4485) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:41:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][600/625] eta 0:00:14 lr 0.000468 wd 0.0500 time 0.5742 (0.5878) data time 0.0008 (0.0015) model time 0.5734 (0.5829) loss 8.6707 (7.4087) grad_norm 2.9015 (2.4474) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:41:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][610/625] eta 0:00:08 lr 0.000467 wd 0.0500 time 0.5717 (0.5876) data time 0.0004 (0.0015) model time 0.5713 (0.5828) loss 7.2641 (7.4046) grad_norm 2.1383 (2.4464) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:41:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [180/300][620/625] eta 0:00:02 lr 0.000467 wd 0.0500 time 0.5765 (0.5873) data time 0.0005 (0.0015) model time 0.5760 (0.5826) loss 6.8219 (7.4007) grad_norm 1.5007 (2.4501) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:41:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 180 training takes 0:06:07 +[2024-07-25 08:41:33 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 08:41:34 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 08:41:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.465 (0.465) Loss 0.5171 (0.5171) Acc@1 89.697 (89.697) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 08:41:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.156) Loss 0.7861 (0.6357) Acc@1 81.934 (86.714) Acc@5 96.045 (97.798) Mem 22339MB +[2024-07-25 08:41:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9033 (0.7388) Acc@1 78.418 (83.631) Acc@5 95.801 (96.810) Mem 22339MB +[2024-07-25 08:41:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.285 Acc@5 96.791 +[2024-07-25 08:41:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.3% +[2024-07-25 08:41:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.847 (0.847) Loss 0.4993 (0.4993) Acc@1 89.990 (89.990) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-25 08:41:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.191) Loss 0.7549 (0.6219) Acc@1 82.959 (87.043) Acc@5 96.631 (97.918) Mem 22339MB +[2024-07-25 08:41:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.160) Loss 0.8799 (0.7185) Acc@1 78.711 (84.045) Acc@5 95.801 (96.959) Mem 22339MB +[2024-07-25 08:41:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.647 Acc@5 96.945 +[2024-07-25 08:41:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.6% +[2024-07-25 08:41:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.65% +[2024-07-25 08:41:42 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 08:41:43 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 08:41:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][0/625] eta 0:09:24 lr 0.000467 wd 0.0500 time 0.9029 (0.9029) data time 0.3841 (0.3841) model time 0.0000 (0.0000) loss 7.4535 (7.4535) grad_norm 3.9940 (3.9940) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:41:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][10/625] eta 0:06:11 lr 0.000467 wd 0.0500 time 0.5759 (0.6034) data time 0.0008 (0.0357) model time 0.0000 (0.0000) loss 6.2150 (7.3503) grad_norm 2.0593 (2.2096) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:41:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][20/625] eta 0:06:01 lr 0.000467 wd 0.0500 time 0.7375 (0.5970) data time 0.0008 (0.0191) model time 0.0000 (0.0000) loss 7.2025 (7.0959) grad_norm 2.0903 (2.1433) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:42:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][30/625] eta 0:05:52 lr 0.000467 wd 0.0500 time 0.5754 (0.5927) data time 0.0007 (0.0132) model time 0.0000 (0.0000) loss 6.2475 (7.1994) grad_norm 2.3313 (2.1382) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:42:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][40/625] eta 0:05:46 lr 0.000467 wd 0.0500 time 0.7180 (0.5928) data time 0.0008 (0.0102) model time 0.0000 (0.0000) loss 6.2771 (7.2665) grad_norm 2.8205 (2.2465) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:42:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][50/625] eta 0:05:45 lr 0.000467 wd 0.0500 time 0.7672 (0.6011) data time 0.0006 (0.0084) model time 0.0000 (0.0000) loss 6.5031 (7.2451) grad_norm 2.4143 (2.2823) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:42:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][60/625] eta 0:05:40 lr 0.000467 wd 0.0500 time 0.6119 (0.6033) data time 0.0006 (0.0072) model time 0.6113 (0.6138) loss 7.4521 (7.2936) grad_norm 1.5516 (2.3148) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:42:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][70/625] eta 0:05:37 lr 0.000467 wd 0.0500 time 0.5753 (0.6086) data time 0.0006 (0.0063) model time 0.5747 (0.6268) loss 7.6144 (7.3289) grad_norm 3.5356 (2.3818) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:42:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][80/625] eta 0:05:29 lr 0.000467 wd 0.0500 time 0.5692 (0.6053) data time 0.0009 (0.0056) model time 0.5683 (0.6116) loss 8.6338 (7.3468) grad_norm 2.9510 (2.4576) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:42:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][90/625] eta 0:05:22 lr 0.000466 wd 0.0500 time 0.5686 (0.6032) data time 0.0008 (0.0051) model time 0.5678 (0.6051) loss 8.2550 (7.3745) grad_norm 3.5256 (2.5556) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:42:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][100/625] eta 0:05:16 lr 0.000466 wd 0.0500 time 0.5202 (0.6022) data time 0.0007 (0.0047) model time 0.5195 (0.6025) loss 7.4612 (7.3630) grad_norm 3.3677 (2.5951) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:42:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][110/625] eta 0:05:08 lr 0.000466 wd 0.0500 time 0.5729 (0.5996) data time 0.0009 (0.0044) model time 0.5720 (0.5975) loss 9.4329 (7.4065) grad_norm 2.6529 (2.5933) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:42:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][120/625] eta 0:05:01 lr 0.000466 wd 0.0500 time 0.5742 (0.5977) data time 0.0008 (0.0041) model time 0.5734 (0.5943) loss 7.8290 (7.4013) grad_norm 2.8662 (2.5864) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:43:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][130/625] eta 0:04:55 lr 0.000466 wd 0.0500 time 0.5743 (0.5960) data time 0.0006 (0.0038) model time 0.5737 (0.5918) loss 8.0929 (7.3791) grad_norm 2.1805 (2.5685) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:43:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][140/625] eta 0:04:48 lr 0.000466 wd 0.0500 time 0.5728 (0.5944) data time 0.0009 (0.0036) model time 0.5719 (0.5898) loss 5.5260 (7.3820) grad_norm 1.9684 (2.5439) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:43:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][150/625] eta 0:04:41 lr 0.000466 wd 0.0500 time 0.5734 (0.5931) data time 0.0008 (0.0034) model time 0.5726 (0.5882) loss 7.9277 (7.3772) grad_norm 2.2910 (2.5236) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:43:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][160/625] eta 0:04:35 lr 0.000466 wd 0.0500 time 0.5707 (0.5920) data time 0.0008 (0.0033) model time 0.5699 (0.5870) loss 6.1850 (7.3465) grad_norm 3.0431 (2.5111) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:43:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][170/625] eta 0:04:28 lr 0.000466 wd 0.0500 time 0.5724 (0.5911) data time 0.0006 (0.0031) model time 0.5718 (0.5859) loss 8.0851 (7.3716) grad_norm 1.8231 (2.5518) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:43:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][180/625] eta 0:04:22 lr 0.000465 wd 0.0500 time 0.5711 (0.5901) data time 0.0008 (0.0030) model time 0.5703 (0.5849) loss 6.8063 (7.3745) grad_norm 3.0854 (2.5681) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:43:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][190/625] eta 0:04:16 lr 0.000465 wd 0.0500 time 0.5700 (0.5892) data time 0.0006 (0.0029) model time 0.5694 (0.5841) loss 7.5055 (7.4015) grad_norm 3.1861 (2.5598) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:43:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][200/625] eta 0:04:10 lr 0.000465 wd 0.0500 time 0.5635 (0.5885) data time 0.0008 (0.0028) model time 0.5626 (0.5833) loss 7.4728 (7.3802) grad_norm 2.5369 (2.5734) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:43:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][210/625] eta 0:04:03 lr 0.000465 wd 0.0500 time 0.5737 (0.5878) data time 0.0006 (0.0027) model time 0.5730 (0.5827) loss 7.2873 (7.3859) grad_norm 2.0217 (2.5797) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:43:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][220/625] eta 0:03:57 lr 0.000465 wd 0.0500 time 0.5744 (0.5872) data time 0.0007 (0.0026) model time 0.5737 (0.5822) loss 8.0515 (7.3935) grad_norm 2.1649 (2.6231) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:43:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][230/625] eta 0:03:51 lr 0.000465 wd 0.0500 time 0.5730 (0.5867) data time 0.0008 (0.0026) model time 0.5722 (0.5817) loss 6.3736 (7.4043) grad_norm 2.5826 (2.6316) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:44:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][240/625] eta 0:03:45 lr 0.000465 wd 0.0500 time 0.5739 (0.5863) data time 0.0008 (0.0026) model time 0.5731 (0.5813) loss 6.7004 (7.4058) grad_norm 1.6706 (2.6120) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:44:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][250/625] eta 0:03:40 lr 0.000465 wd 0.0500 time 0.5748 (0.5869) data time 0.0006 (0.0025) model time 0.5742 (0.5822) loss 6.7467 (7.4202) grad_norm 2.3442 (2.6027) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:44:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][260/625] eta 0:03:34 lr 0.000465 wd 0.0500 time 0.7038 (0.5873) data time 0.0008 (0.0025) model time 0.7030 (0.5829) loss 6.2897 (7.4032) grad_norm 2.2511 (2.6143) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:44:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][270/625] eta 0:03:29 lr 0.000465 wd 0.0500 time 0.7402 (0.5895) data time 0.0008 (0.0024) model time 0.7394 (0.5858) loss 7.9184 (7.4082) grad_norm 1.9747 (2.6017) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:44:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][280/625] eta 0:03:23 lr 0.000464 wd 0.0500 time 0.5710 (0.5901) data time 0.0006 (0.0024) model time 0.5704 (0.5866) loss 7.3609 (7.4029) grad_norm 2.0363 (2.5814) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:44:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][290/625] eta 0:03:17 lr 0.000464 wd 0.0500 time 0.5723 (0.5908) data time 0.0006 (0.0023) model time 0.5717 (0.5876) loss 6.5406 (7.4004) grad_norm 2.1728 (2.5609) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:44:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][300/625] eta 0:03:12 lr 0.000464 wd 0.0500 time 0.7200 (0.5913) data time 0.0006 (0.0023) model time 0.7194 (0.5882) loss 5.8761 (7.3963) grad_norm 1.9632 (2.5516) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:44:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][310/625] eta 0:03:06 lr 0.000464 wd 0.0500 time 0.5766 (0.5907) data time 0.0009 (0.0022) model time 0.5757 (0.5877) loss 5.5483 (7.3818) grad_norm 1.9294 (2.5468) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:44:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][320/625] eta 0:03:00 lr 0.000464 wd 0.0500 time 0.5711 (0.5904) data time 0.0008 (0.0022) model time 0.5703 (0.5873) loss 6.9103 (7.3806) grad_norm 2.1941 (2.5571) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:44:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][330/625] eta 0:02:54 lr 0.000464 wd 0.0500 time 0.5721 (0.5899) data time 0.0009 (0.0022) model time 0.5712 (0.5868) loss 6.7120 (7.3870) grad_norm 1.8979 (2.5529) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:45:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][340/625] eta 0:02:48 lr 0.000464 wd 0.0500 time 0.5635 (0.5895) data time 0.0007 (0.0021) model time 0.5628 (0.5864) loss 6.0954 (7.3688) grad_norm 3.4204 (2.5427) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:45:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][350/625] eta 0:02:41 lr 0.000464 wd 0.0500 time 0.5748 (0.5890) data time 0.0006 (0.0021) model time 0.5742 (0.5860) loss 6.1738 (7.3657) grad_norm 3.1996 (2.5447) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:45:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][360/625] eta 0:02:35 lr 0.000464 wd 0.0500 time 0.5740 (0.5886) data time 0.0008 (0.0021) model time 0.5732 (0.5856) loss 6.5202 (7.3657) grad_norm 3.9345 (2.5420) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:45:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][370/625] eta 0:02:30 lr 0.000464 wd 0.0500 time 0.5725 (0.5883) data time 0.0009 (0.0020) model time 0.5716 (0.5852) loss 8.5002 (7.3620) grad_norm 2.9090 (2.5361) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:45:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][380/625] eta 0:02:24 lr 0.000463 wd 0.0500 time 0.5685 (0.5879) data time 0.0008 (0.0020) model time 0.5677 (0.5849) loss 8.4482 (7.3613) grad_norm 1.9413 (2.5350) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:45:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][390/625] eta 0:02:18 lr 0.000463 wd 0.0500 time 0.5698 (0.5876) data time 0.0007 (0.0020) model time 0.5691 (0.5845) loss 6.4054 (7.3731) grad_norm 1.8645 (2.5208) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:45:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][400/625] eta 0:02:12 lr 0.000463 wd 0.0500 time 0.5696 (0.5872) data time 0.0006 (0.0019) model time 0.5689 (0.5842) loss 6.7685 (7.3777) grad_norm 1.5845 (2.5121) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:45:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][410/625] eta 0:02:06 lr 0.000463 wd 0.0500 time 0.5716 (0.5869) data time 0.0009 (0.0019) model time 0.5707 (0.5839) loss 7.3154 (7.3790) grad_norm 2.4915 (2.5316) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 08:45:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][420/625] eta 0:02:00 lr 0.000463 wd 0.0500 time 0.5713 (0.5866) data time 0.0006 (0.0019) model time 0.5707 (0.5836) loss 7.5448 (7.3675) grad_norm 1.6341 (2.5308) loss_scale 1024.0000 (520.5131) mem 22339MB +[2024-07-25 08:45:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][430/625] eta 0:01:54 lr 0.000463 wd 0.0500 time 0.5709 (0.5863) data time 0.0006 (0.0019) model time 0.5703 (0.5833) loss 7.2350 (7.3687) grad_norm 1.9477 (2.5372) loss_scale 1024.0000 (532.1949) mem 22339MB +[2024-07-25 08:46:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][440/625] eta 0:01:48 lr 0.000463 wd 0.0500 time 0.5666 (0.5860) data time 0.0008 (0.0018) model time 0.5658 (0.5831) loss 7.7028 (7.3742) grad_norm 1.4663 (2.5247) loss_scale 1024.0000 (543.3469) mem 22339MB +[2024-07-25 08:46:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][450/625] eta 0:01:42 lr 0.000463 wd 0.0500 time 0.5644 (0.5858) data time 0.0006 (0.0018) model time 0.5638 (0.5828) loss 7.6521 (7.3836) grad_norm 1.7473 (2.5149) loss_scale 1024.0000 (554.0044) mem 22339MB +[2024-07-25 08:46:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][460/625] eta 0:01:36 lr 0.000463 wd 0.0500 time 0.5724 (0.5855) data time 0.0007 (0.0018) model time 0.5717 (0.5826) loss 7.7299 (7.3819) grad_norm 2.2551 (2.5261) loss_scale 1024.0000 (564.1996) mem 22339MB +[2024-07-25 08:46:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][470/625] eta 0:01:30 lr 0.000462 wd 0.0500 time 0.5733 (0.5857) data time 0.0006 (0.0018) model time 0.5727 (0.5828) loss 6.7329 (7.3896) grad_norm 2.3671 (2.5250) loss_scale 1024.0000 (573.9618) mem 22339MB +[2024-07-25 08:46:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][480/625] eta 0:01:24 lr 0.000462 wd 0.0500 time 0.5662 (0.5860) data time 0.0009 (0.0018) model time 0.5653 (0.5832) loss 7.2458 (7.3817) grad_norm 2.9336 (2.5202) loss_scale 1024.0000 (583.3181) mem 22339MB +[2024-07-25 08:46:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][490/625] eta 0:01:19 lr 0.000462 wd 0.0500 time 0.7661 (0.5874) data time 0.0006 (0.0017) model time 0.7655 (0.5848) loss 6.8904 (7.3812) grad_norm 2.1335 (2.5138) loss_scale 1024.0000 (592.2933) mem 22339MB +[2024-07-25 08:46:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][500/625] eta 0:01:13 lr 0.000462 wd 0.0500 time 0.5708 (0.5878) data time 0.0006 (0.0017) model time 0.5702 (0.5853) loss 7.4463 (7.3744) grad_norm 2.6043 (2.5175) loss_scale 1024.0000 (600.9102) mem 22339MB +[2024-07-25 08:46:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][510/625] eta 0:01:07 lr 0.000462 wd 0.0500 time 0.7345 (0.5885) data time 0.0009 (0.0017) model time 0.7336 (0.5861) loss 8.7574 (7.3796) grad_norm 1.5935 (2.5120) loss_scale 1024.0000 (609.1898) mem 22339MB +[2024-07-25 08:46:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][520/625] eta 0:01:01 lr 0.000462 wd 0.0500 time 0.5710 (0.5882) data time 0.0006 (0.0017) model time 0.5704 (0.5859) loss 6.4675 (7.3750) grad_norm 3.0941 (2.5184) loss_scale 1024.0000 (617.1516) mem 22339MB +[2024-07-25 08:46:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][530/625] eta 0:00:55 lr 0.000462 wd 0.0500 time 0.5638 (0.5882) data time 0.0007 (0.0017) model time 0.5630 (0.5858) loss 6.3495 (7.3703) grad_norm 2.4252 (2.5130) loss_scale 1024.0000 (624.8136) mem 22339MB +[2024-07-25 08:47:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][540/625] eta 0:00:49 lr 0.000462 wd 0.0500 time 0.5707 (0.5882) data time 0.0006 (0.0017) model time 0.5701 (0.5859) loss 7.3584 (7.3823) grad_norm 3.2112 (2.5085) loss_scale 1024.0000 (632.1922) mem 22339MB +[2024-07-25 08:47:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][550/625] eta 0:00:44 lr 0.000462 wd 0.0500 time 0.5708 (0.5879) data time 0.0006 (0.0016) model time 0.5702 (0.5856) loss 6.5394 (7.3751) grad_norm 1.7943 (2.4994) loss_scale 1024.0000 (639.3031) mem 22339MB +[2024-07-25 08:47:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][560/625] eta 0:00:38 lr 0.000462 wd 0.0500 time 0.5715 (0.5876) data time 0.0008 (0.0016) model time 0.5707 (0.5853) loss 7.6217 (7.3721) grad_norm 2.3474 (2.4889) loss_scale 1024.0000 (646.1604) mem 22339MB +[2024-07-25 08:47:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][570/625] eta 0:00:32 lr 0.000461 wd 0.0500 time 0.5739 (0.5874) data time 0.0008 (0.0016) model time 0.5732 (0.5851) loss 7.3812 (7.3702) grad_norm 2.2029 (2.4849) loss_scale 1024.0000 (652.7776) mem 22339MB +[2024-07-25 08:47:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][580/625] eta 0:00:26 lr 0.000461 wd 0.0500 time 0.5662 (0.5872) data time 0.0006 (0.0016) model time 0.5655 (0.5849) loss 6.4478 (7.3727) grad_norm 1.7110 (2.4788) loss_scale 1024.0000 (659.1670) mem 22339MB +[2024-07-25 08:47:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][590/625] eta 0:00:20 lr 0.000461 wd 0.0500 time 0.5740 (0.5870) data time 0.0008 (0.0016) model time 0.5732 (0.5847) loss 7.0213 (7.3688) grad_norm 2.6120 (2.4988) loss_scale 1024.0000 (665.3401) mem 22339MB +[2024-07-25 08:47:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][600/625] eta 0:00:14 lr 0.000461 wd 0.0500 time 0.5743 (0.5868) data time 0.0008 (0.0016) model time 0.5735 (0.5845) loss 8.4806 (7.3681) grad_norm 2.2023 (2.4975) loss_scale 1024.0000 (671.3078) mem 22339MB +[2024-07-25 08:47:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][610/625] eta 0:00:08 lr 0.000461 wd 0.0500 time 0.5690 (0.5866) data time 0.0006 (0.0016) model time 0.5684 (0.5843) loss 8.4703 (7.3685) grad_norm 1.9200 (2.4962) loss_scale 1024.0000 (677.0802) mem 22339MB +[2024-07-25 08:47:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [181/300][620/625] eta 0:00:02 lr 0.000461 wd 0.0500 time 0.5703 (0.5863) data time 0.0004 (0.0016) model time 0.5699 (0.5841) loss 7.3582 (7.3674) grad_norm 2.1752 (2.4937) loss_scale 1024.0000 (682.6667) mem 22339MB +[2024-07-25 08:47:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 181 training takes 0:06:06 +[2024-07-25 08:47:49 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 08:47:51 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 08:47:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.520 (0.520) Loss 0.5186 (0.5186) Acc@1 89.502 (89.502) Acc@5 98.633 (98.633) Mem 22339MB +[2024-07-25 08:47:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.161) Loss 0.7661 (0.6344) Acc@1 82.568 (86.732) Acc@5 96.875 (97.865) Mem 22339MB +[2024-07-25 08:47:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.144) Loss 0.8906 (0.7357) Acc@1 78.418 (83.780) Acc@5 95.654 (96.863) Mem 22339MB +[2024-07-25 08:47:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.397 Acc@5 96.833 +[2024-07-25 08:47:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.4% +[2024-07-25 08:47:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 83.40% +[2024-07-25 08:47:54 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 08:47:56 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 08:47:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.479 (0.479) Loss 0.4995 (0.4995) Acc@1 89.990 (89.990) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-25 08:47:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.159) Loss 0.7544 (0.6219) Acc@1 83.057 (87.065) Acc@5 96.582 (97.927) Mem 22339MB +[2024-07-25 08:47:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.8799 (0.7182) Acc@1 78.760 (84.077) Acc@5 95.801 (96.977) Mem 22339MB +[2024-07-25 08:47:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.675 Acc@5 96.969 +[2024-07-25 08:47:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.7% +[2024-07-25 08:47:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.68% +[2024-07-25 08:47:59 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 08:48:01 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 08:48:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][0/625] eta 0:09:23 lr 0.000461 wd 0.0500 time 0.9018 (0.9018) data time 0.3838 (0.3838) model time 0.0000 (0.0000) loss 8.3792 (8.3792) grad_norm 1.8723 (1.8723) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:48:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][10/625] eta 0:06:11 lr 0.000461 wd 0.0500 time 0.5747 (0.6043) data time 0.0007 (0.0357) model time 0.0000 (0.0000) loss 8.3038 (7.3146) grad_norm 2.2236 (1.9671) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:48:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][20/625] eta 0:05:57 lr 0.000461 wd 0.0500 time 0.5690 (0.5901) data time 0.0006 (0.0191) model time 0.0000 (0.0000) loss 7.6577 (7.4905) grad_norm 2.0805 (2.2427) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:48:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][30/625] eta 0:05:48 lr 0.000461 wd 0.0500 time 0.5699 (0.5851) data time 0.0007 (0.0133) model time 0.0000 (0.0000) loss 7.5207 (7.3950) grad_norm 1.9525 (2.2614) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:48:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][40/625] eta 0:05:42 lr 0.000460 wd 0.0500 time 0.5739 (0.5862) data time 0.0008 (0.0103) model time 0.0000 (0.0000) loss 6.8747 (7.3952) grad_norm 4.1540 (2.3285) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:48:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][50/625] eta 0:05:36 lr 0.000460 wd 0.0500 time 0.5622 (0.5844) data time 0.0008 (0.0085) model time 0.0000 (0.0000) loss 7.2668 (7.3557) grad_norm 1.8256 (2.2801) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:48:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][60/625] eta 0:05:29 lr 0.000460 wd 0.0500 time 0.5715 (0.5836) data time 0.0006 (0.0073) model time 0.5709 (0.5785) loss 7.0128 (7.3961) grad_norm 2.5551 (2.2756) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:48:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][70/625] eta 0:05:24 lr 0.000460 wd 0.0500 time 0.7296 (0.5852) data time 0.0007 (0.0064) model time 0.7289 (0.5863) loss 7.0360 (7.3943) grad_norm 2.6095 (2.3178) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:48:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][80/625] eta 0:05:19 lr 0.000460 wd 0.0500 time 0.7114 (0.5865) data time 0.0008 (0.0057) model time 0.7106 (0.5892) loss 9.5366 (7.4184) grad_norm 2.1308 (2.2995) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:48:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][90/625] eta 0:05:16 lr 0.000460 wd 0.0500 time 0.7458 (0.5924) data time 0.0008 (0.0052) model time 0.7450 (0.6018) loss 6.8725 (7.4314) grad_norm 2.0221 (2.4125) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:49:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][100/625] eta 0:05:14 lr 0.000460 wd 0.0500 time 0.7294 (0.5981) data time 0.0008 (0.0047) model time 0.7286 (0.6112) loss 8.4762 (7.4529) grad_norm 2.2372 (2.3943) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:49:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][110/625] eta 0:05:08 lr 0.000460 wd 0.0500 time 0.5711 (0.6000) data time 0.0008 (0.0044) model time 0.5703 (0.6124) loss 7.7044 (7.4544) grad_norm 2.5447 (2.3745) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:49:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][120/625] eta 0:05:02 lr 0.000460 wd 0.0500 time 0.5634 (0.5991) data time 0.0008 (0.0041) model time 0.5626 (0.6090) loss 6.1417 (7.4694) grad_norm 3.8494 (2.4079) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:49:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][130/625] eta 0:04:55 lr 0.000460 wd 0.0500 time 0.5715 (0.5972) data time 0.0008 (0.0039) model time 0.5707 (0.6044) loss 7.9170 (7.4748) grad_norm 2.1994 (2.4811) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:49:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][140/625] eta 0:04:48 lr 0.000459 wd 0.0500 time 0.5744 (0.5954) data time 0.0008 (0.0037) model time 0.5737 (0.6007) loss 7.2752 (7.4749) grad_norm 2.8856 (2.5215) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:49:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][150/625] eta 0:04:42 lr 0.000459 wd 0.0500 time 0.5726 (0.5940) data time 0.0008 (0.0035) model time 0.5718 (0.5979) loss 7.6659 (7.4812) grad_norm 2.2347 (2.5302) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:49:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][160/625] eta 0:04:35 lr 0.000459 wd 0.0500 time 0.5698 (0.5927) data time 0.0008 (0.0034) model time 0.5690 (0.5956) loss 7.8310 (7.4568) grad_norm 2.5002 (2.5224) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:49:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][170/625] eta 0:04:29 lr 0.000459 wd 0.0500 time 0.5713 (0.5917) data time 0.0008 (0.0032) model time 0.5706 (0.5938) loss 7.0346 (7.4473) grad_norm 2.0503 (2.5161) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:49:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][180/625] eta 0:04:22 lr 0.000459 wd 0.0500 time 0.5722 (0.5907) data time 0.0008 (0.0031) model time 0.5714 (0.5923) loss 8.9526 (7.4589) grad_norm 4.5408 (2.5376) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:49:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][190/625] eta 0:04:16 lr 0.000459 wd 0.0500 time 0.5698 (0.5898) data time 0.0006 (0.0030) model time 0.5692 (0.5909) loss 7.9795 (7.4667) grad_norm 4.0829 (2.5810) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:49:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][200/625] eta 0:04:10 lr 0.000459 wd 0.0500 time 0.5749 (0.5891) data time 0.0008 (0.0029) model time 0.5741 (0.5897) loss 6.9184 (7.4550) grad_norm 2.1560 (2.5708) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:50:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][210/625] eta 0:04:04 lr 0.000459 wd 0.0500 time 0.5718 (0.5884) data time 0.0007 (0.0028) model time 0.5711 (0.5887) loss 6.5877 (7.4219) grad_norm 2.1591 (2.5473) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:50:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][220/625] eta 0:03:58 lr 0.000459 wd 0.0500 time 0.5746 (0.5877) data time 0.0008 (0.0027) model time 0.5739 (0.5878) loss 6.8225 (7.4132) grad_norm 2.0893 (2.5269) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:50:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][230/625] eta 0:03:51 lr 0.000458 wd 0.0500 time 0.5701 (0.5871) data time 0.0008 (0.0026) model time 0.5693 (0.5870) loss 7.8973 (7.4069) grad_norm 2.2616 (2.5040) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:50:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][240/625] eta 0:03:45 lr 0.000458 wd 0.0500 time 0.5728 (0.5866) data time 0.0007 (0.0025) model time 0.5721 (0.5863) loss 8.3787 (7.3981) grad_norm 1.8973 (2.4879) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:50:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][250/625] eta 0:03:39 lr 0.000458 wd 0.0500 time 0.5775 (0.5862) data time 0.0008 (0.0025) model time 0.5767 (0.5857) loss 7.6516 (7.3964) grad_norm 2.1433 (2.4794) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:50:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][260/625] eta 0:03:34 lr 0.000458 wd 0.0500 time 0.5707 (0.5864) data time 0.0008 (0.0024) model time 0.5699 (0.5859) loss 9.1568 (7.4046) grad_norm 1.9318 (2.4674) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:50:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][270/625] eta 0:03:28 lr 0.000458 wd 0.0500 time 0.5630 (0.5860) data time 0.0007 (0.0023) model time 0.5623 (0.5855) loss 7.0033 (7.4121) grad_norm 3.0079 (2.4593) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:50:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][280/625] eta 0:03:22 lr 0.000458 wd 0.0500 time 0.5712 (0.5858) data time 0.0006 (0.0023) model time 0.5706 (0.5853) loss 6.5544 (7.4116) grad_norm 2.2493 (2.4650) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:50:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][290/625] eta 0:03:16 lr 0.000458 wd 0.0500 time 0.5731 (0.5856) data time 0.0008 (0.0022) model time 0.5722 (0.5849) loss 7.4244 (7.4162) grad_norm 1.7384 (2.4585) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:50:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][300/625] eta 0:03:10 lr 0.000458 wd 0.0500 time 0.5660 (0.5859) data time 0.0008 (0.0022) model time 0.5652 (0.5853) loss 9.1293 (7.4227) grad_norm 1.8677 (2.5094) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:51:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][310/625] eta 0:03:05 lr 0.000458 wd 0.0500 time 0.5656 (0.5876) data time 0.0008 (0.0022) model time 0.5647 (0.5873) loss 7.8307 (7.4307) grad_norm 2.0458 (2.5171) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:51:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][320/625] eta 0:02:59 lr 0.000458 wd 0.0500 time 0.7307 (0.5892) data time 0.0006 (0.0021) model time 0.7301 (0.5892) loss 7.0997 (7.4281) grad_norm 2.6960 (2.5156) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:51:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][330/625] eta 0:02:54 lr 0.000457 wd 0.0500 time 0.5620 (0.5901) data time 0.0008 (0.0021) model time 0.5612 (0.5903) loss 7.9706 (7.4288) grad_norm 20.4705 (2.5713) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:51:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][340/625] eta 0:02:48 lr 0.000457 wd 0.0500 time 0.5758 (0.5898) data time 0.0007 (0.0020) model time 0.5751 (0.5898) loss 8.7767 (7.4231) grad_norm 1.9926 (2.5755) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:51:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][350/625] eta 0:02:42 lr 0.000457 wd 0.0500 time 0.5660 (0.5896) data time 0.0008 (0.0020) model time 0.5652 (0.5896) loss 7.2162 (7.4281) grad_norm 1.8350 (2.5678) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:51:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][360/625] eta 0:02:36 lr 0.000457 wd 0.0500 time 0.5685 (0.5893) data time 0.0008 (0.0020) model time 0.5677 (0.5892) loss 8.1057 (7.4145) grad_norm 1.8472 (2.5624) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:51:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][370/625] eta 0:02:30 lr 0.000457 wd 0.0500 time 0.5659 (0.5891) data time 0.0006 (0.0020) model time 0.5652 (0.5889) loss 7.2906 (7.4028) grad_norm 3.7165 (2.5653) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:51:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][380/625] eta 0:02:24 lr 0.000457 wd 0.0500 time 0.5633 (0.5888) data time 0.0007 (0.0019) model time 0.5627 (0.5886) loss 6.7180 (7.3944) grad_norm 2.3726 (2.5669) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:51:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][390/625] eta 0:02:18 lr 0.000457 wd 0.0500 time 0.5675 (0.5886) data time 0.0006 (0.0019) model time 0.5669 (0.5883) loss 5.5254 (7.3975) grad_norm 3.9888 (2.5868) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:51:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][400/625] eta 0:02:12 lr 0.000457 wd 0.0500 time 0.5633 (0.5884) data time 0.0006 (0.0019) model time 0.5627 (0.5880) loss 6.2183 (7.3919) grad_norm 1.9793 (2.5933) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:52:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][410/625] eta 0:02:06 lr 0.000457 wd 0.0500 time 0.5616 (0.5883) data time 0.0008 (0.0019) model time 0.5608 (0.5878) loss 7.7935 (7.3887) grad_norm 3.3236 (2.5880) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:52:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][420/625] eta 0:02:00 lr 0.000457 wd 0.0500 time 0.5639 (0.5880) data time 0.0008 (0.0019) model time 0.5630 (0.5875) loss 7.1281 (7.3792) grad_norm 2.2902 (2.5791) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:52:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][430/625] eta 0:01:54 lr 0.000456 wd 0.0500 time 0.5731 (0.5877) data time 0.0006 (0.0019) model time 0.5725 (0.5872) loss 5.9445 (7.3744) grad_norm 3.8795 (2.5819) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:52:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][440/625] eta 0:01:48 lr 0.000456 wd 0.0500 time 0.5734 (0.5874) data time 0.0009 (0.0018) model time 0.5725 (0.5869) loss 6.0602 (7.3727) grad_norm 1.8808 (2.5861) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:52:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][450/625] eta 0:01:42 lr 0.000456 wd 0.0500 time 0.5697 (0.5871) data time 0.0007 (0.0018) model time 0.5690 (0.5865) loss 6.4883 (7.3796) grad_norm 1.8580 (2.5810) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:52:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][460/625] eta 0:01:36 lr 0.000456 wd 0.0500 time 0.5712 (0.5869) data time 0.0008 (0.0018) model time 0.5704 (0.5862) loss 7.8813 (7.3814) grad_norm 2.2647 (2.5751) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:52:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][470/625] eta 0:01:30 lr 0.000456 wd 0.0500 time 0.5714 (0.5866) data time 0.0006 (0.0018) model time 0.5708 (0.5859) loss 6.4682 (7.3803) grad_norm 2.0697 (2.5674) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:52:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][480/625] eta 0:01:25 lr 0.000456 wd 0.0500 time 0.5702 (0.5865) data time 0.0008 (0.0018) model time 0.5694 (0.5858) loss 7.4268 (7.3836) grad_norm 3.0119 (2.5707) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:52:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][490/625] eta 0:01:19 lr 0.000456 wd 0.0500 time 0.5739 (0.5862) data time 0.0007 (0.0017) model time 0.5731 (0.5855) loss 7.8309 (7.3803) grad_norm 2.3825 (2.5662) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:52:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][500/625] eta 0:01:13 lr 0.000456 wd 0.0500 time 0.7108 (0.5863) data time 0.0008 (0.0017) model time 0.7099 (0.5856) loss 9.0349 (7.3777) grad_norm 1.9376 (2.5616) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:53:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][510/625] eta 0:01:07 lr 0.000456 wd 0.0500 time 0.5672 (0.5862) data time 0.0008 (0.0017) model time 0.5664 (0.5854) loss 6.3194 (7.3748) grad_norm 3.3002 (2.5559) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:53:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][520/625] eta 0:01:01 lr 0.000455 wd 0.0500 time 0.5599 (0.5869) data time 0.0009 (0.0017) model time 0.5591 (0.5862) loss 7.4437 (7.3738) grad_norm 1.5275 (2.5479) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:53:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][530/625] eta 0:00:55 lr 0.000455 wd 0.0500 time 0.5704 (0.5875) data time 0.0007 (0.0017) model time 0.5697 (0.5869) loss 7.2359 (7.3709) grad_norm 2.3953 (2.5399) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:53:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][540/625] eta 0:00:50 lr 0.000455 wd 0.0500 time 0.7066 (0.5887) data time 0.0006 (0.0017) model time 0.7061 (0.5881) loss 7.4322 (7.3789) grad_norm 1.6592 (2.5277) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:53:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][550/625] eta 0:00:44 lr 0.000455 wd 0.0500 time 0.5717 (0.5889) data time 0.0008 (0.0017) model time 0.5709 (0.5884) loss 7.1749 (7.3879) grad_norm 1.8195 (2.5221) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:53:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][560/625] eta 0:00:38 lr 0.000455 wd 0.0500 time 0.5706 (0.5887) data time 0.0006 (0.0016) model time 0.5700 (0.5882) loss 7.9394 (7.3911) grad_norm 1.9048 (2.5290) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:53:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][570/625] eta 0:00:32 lr 0.000455 wd 0.0500 time 0.5709 (0.5885) data time 0.0006 (0.0016) model time 0.5703 (0.5879) loss 8.1124 (7.3935) grad_norm 3.3086 (2.5300) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:53:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][580/625] eta 0:00:26 lr 0.000455 wd 0.0500 time 0.5730 (0.5882) data time 0.0006 (0.0016) model time 0.5725 (0.5876) loss 7.4965 (7.3894) grad_norm 3.2732 (2.5589) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:53:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][590/625] eta 0:00:20 lr 0.000455 wd 0.0500 time 0.5703 (0.5880) data time 0.0008 (0.0016) model time 0.5695 (0.5874) loss 8.0704 (7.3856) grad_norm 3.1287 (2.5888) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:53:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][600/625] eta 0:00:14 lr 0.000455 wd 0.0500 time 0.5713 (0.5877) data time 0.0006 (0.0016) model time 0.5707 (0.5871) loss 6.7059 (7.3910) grad_norm 1.8412 (2.5848) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:54:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][610/625] eta 0:00:08 lr 0.000455 wd 0.0500 time 0.5703 (0.5875) data time 0.0004 (0.0016) model time 0.5699 (0.5868) loss 7.0251 (7.3920) grad_norm 4.8027 (2.5878) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:54:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [182/300][620/625] eta 0:00:02 lr 0.000454 wd 0.0500 time 0.5691 (0.5873) data time 0.0006 (0.0016) model time 0.5685 (0.5866) loss 7.3803 (7.3941) grad_norm 2.2374 (2.5815) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:54:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 182 training takes 0:06:07 +[2024-07-25 08:54:08 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 08:54:10 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 08:54:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.491 (0.491) Loss 0.5122 (0.5122) Acc@1 89.844 (89.844) Acc@5 98.633 (98.633) Mem 22339MB +[2024-07-25 08:54:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.159) Loss 0.7710 (0.6278) Acc@1 82.422 (86.790) Acc@5 96.436 (97.798) Mem 22339MB +[2024-07-25 08:54:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.143) Loss 0.8794 (0.7337) Acc@1 78.223 (83.740) Acc@5 95.752 (96.824) Mem 22339MB +[2024-07-25 08:54:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.431 Acc@5 96.807 +[2024-07-25 08:54:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.4% +[2024-07-25 08:54:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 83.43% +[2024-07-25 08:54:13 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 08:54:15 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 08:54:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.462 (0.462) Loss 0.5000 (0.5000) Acc@1 89.990 (89.990) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-25 08:54:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.156) Loss 0.7534 (0.6218) Acc@1 83.154 (87.087) Acc@5 96.582 (97.936) Mem 22339MB +[2024-07-25 08:54:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8789 (0.7179) Acc@1 78.711 (84.098) Acc@5 95.850 (96.989) Mem 22339MB +[2024-07-25 08:54:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.705 Acc@5 96.975 +[2024-07-25 08:54:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.7% +[2024-07-25 08:54:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.71% +[2024-07-25 08:54:18 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 08:54:20 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 08:54:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][0/625] eta 0:08:54 lr 0.000454 wd 0.0500 time 0.8556 (0.8556) data time 0.3378 (0.3378) model time 0.0000 (0.0000) loss 7.9324 (7.9324) grad_norm 2.3589 (2.3589) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:54:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][10/625] eta 0:06:08 lr 0.000454 wd 0.0500 time 0.5738 (0.5999) data time 0.0007 (0.0316) model time 0.0000 (0.0000) loss 6.6179 (7.3561) grad_norm 1.9690 (2.1887) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:54:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][20/625] eta 0:05:55 lr 0.000454 wd 0.0500 time 0.5718 (0.5873) data time 0.0008 (0.0170) model time 0.0000 (0.0000) loss 6.1582 (7.2548) grad_norm 2.0016 (2.1765) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:54:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][30/625] eta 0:05:46 lr 0.000454 wd 0.0500 time 0.5644 (0.5828) data time 0.0006 (0.0118) model time 0.0000 (0.0000) loss 8.8184 (7.1971) grad_norm 2.1867 (2.2792) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:54:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][40/625] eta 0:05:39 lr 0.000454 wd 0.0500 time 0.5738 (0.5807) data time 0.0009 (0.0091) model time 0.0000 (0.0000) loss 8.3654 (7.2310) grad_norm 3.1669 (2.3215) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:54:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][50/625] eta 0:05:33 lr 0.000454 wd 0.0500 time 0.5697 (0.5795) data time 0.0008 (0.0075) model time 0.0000 (0.0000) loss 6.9087 (7.1492) grad_norm 3.9031 (2.3928) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:54:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][60/625] eta 0:05:26 lr 0.000454 wd 0.0500 time 0.5735 (0.5787) data time 0.0006 (0.0064) model time 0.5728 (0.5736) loss 6.1827 (7.2192) grad_norm 2.3979 (2.4556) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:55:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][70/625] eta 0:05:20 lr 0.000454 wd 0.0500 time 0.5707 (0.5782) data time 0.0008 (0.0056) model time 0.5700 (0.5740) loss 7.4953 (7.2380) grad_norm 2.4196 (2.4589) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:55:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][80/625] eta 0:05:14 lr 0.000454 wd 0.0500 time 0.5728 (0.5779) data time 0.0008 (0.0050) model time 0.5720 (0.5744) loss 6.0066 (7.2682) grad_norm 2.2149 (2.4495) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:55:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][90/625] eta 0:05:08 lr 0.000453 wd 0.0500 time 0.5692 (0.5775) data time 0.0007 (0.0046) model time 0.5685 (0.5741) loss 6.2477 (7.2687) grad_norm 1.7021 (2.5253) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:55:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][100/625] eta 0:05:03 lr 0.000453 wd 0.0500 time 0.5730 (0.5783) data time 0.0008 (0.0042) model time 0.5722 (0.5762) loss 6.6650 (7.2786) grad_norm 2.4358 (2.5101) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:55:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][110/625] eta 0:04:58 lr 0.000453 wd 0.0500 time 0.6094 (0.5799) data time 0.0008 (0.0039) model time 0.6086 (0.5793) loss 8.7859 (7.3030) grad_norm 2.3154 (2.5327) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:55:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][120/625] eta 0:04:53 lr 0.000453 wd 0.0500 time 0.7435 (0.5819) data time 0.0006 (0.0037) model time 0.7429 (0.5827) loss 6.1231 (7.2753) grad_norm 4.5714 (2.5681) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:55:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][130/625] eta 0:04:50 lr 0.000453 wd 0.0500 time 0.5708 (0.5866) data time 0.0006 (0.0035) model time 0.5702 (0.5902) loss 7.3634 (7.2693) grad_norm 2.0951 (2.5489) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:55:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][140/625] eta 0:04:45 lr 0.000453 wd 0.0500 time 0.5711 (0.5892) data time 0.0009 (0.0033) model time 0.5701 (0.5938) loss 8.3320 (7.2922) grad_norm 2.5154 (2.5346) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:55:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][150/625] eta 0:04:40 lr 0.000453 wd 0.0500 time 0.5704 (0.5896) data time 0.0007 (0.0031) model time 0.5697 (0.5939) loss 6.5078 (7.2937) grad_norm 1.8320 (2.5664) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:55:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][160/625] eta 0:04:34 lr 0.000453 wd 0.0500 time 0.5760 (0.5895) data time 0.0008 (0.0030) model time 0.5753 (0.5932) loss 6.8545 (7.3008) grad_norm 3.6722 (2.5682) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:56:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][170/625] eta 0:04:27 lr 0.000453 wd 0.0500 time 0.5755 (0.5886) data time 0.0006 (0.0028) model time 0.5749 (0.5916) loss 8.0937 (7.2791) grad_norm 2.2839 (2.6450) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:56:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][180/625] eta 0:04:21 lr 0.000453 wd 0.0500 time 0.5753 (0.5878) data time 0.0008 (0.0027) model time 0.5745 (0.5902) loss 8.3752 (7.3116) grad_norm 3.7129 (2.6689) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:56:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][190/625] eta 0:04:15 lr 0.000452 wd 0.0500 time 0.5749 (0.5872) data time 0.0006 (0.0026) model time 0.5743 (0.5891) loss 6.9077 (7.3275) grad_norm 2.7904 (2.6674) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:56:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][200/625] eta 0:04:09 lr 0.000452 wd 0.0500 time 0.5724 (0.5866) data time 0.0007 (0.0025) model time 0.5717 (0.5882) loss 7.7982 (7.3200) grad_norm 1.8446 (2.6585) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:56:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][210/625] eta 0:04:03 lr 0.000452 wd 0.0500 time 0.5734 (0.5864) data time 0.0006 (0.0025) model time 0.5728 (0.5877) loss 8.0513 (7.3247) grad_norm 2.1287 (2.7165) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:56:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][220/625] eta 0:03:57 lr 0.000452 wd 0.0500 time 0.5714 (0.5858) data time 0.0008 (0.0024) model time 0.5706 (0.5868) loss 8.8238 (7.3322) grad_norm 3.5976 (2.7535) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:56:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][230/625] eta 0:03:51 lr 0.000452 wd 0.0500 time 0.5730 (0.5853) data time 0.0010 (0.0023) model time 0.5720 (0.5861) loss 6.7697 (7.3173) grad_norm 1.7881 (2.7649) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:56:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][240/625] eta 0:03:45 lr 0.000452 wd 0.0500 time 0.5732 (0.5849) data time 0.0008 (0.0023) model time 0.5724 (0.5854) loss 7.4766 (7.3104) grad_norm 2.1055 (2.7703) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:56:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][250/625] eta 0:03:39 lr 0.000452 wd 0.0500 time 0.5727 (0.5845) data time 0.0008 (0.0022) model time 0.5719 (0.5849) loss 6.4586 (7.2953) grad_norm 5.5443 (2.7784) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:56:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][260/625] eta 0:03:33 lr 0.000452 wd 0.0500 time 0.5767 (0.5841) data time 0.0007 (0.0022) model time 0.5760 (0.5843) loss 7.9260 (7.3035) grad_norm 3.9086 (2.7828) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:56:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][270/625] eta 0:03:27 lr 0.000452 wd 0.0500 time 0.5703 (0.5838) data time 0.0006 (0.0021) model time 0.5697 (0.5839) loss 7.3875 (7.2951) grad_norm 3.1858 (2.7856) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:57:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][280/625] eta 0:03:21 lr 0.000452 wd 0.0500 time 0.5695 (0.5834) data time 0.0006 (0.0021) model time 0.5689 (0.5834) loss 6.8960 (7.2991) grad_norm 1.9180 (2.7607) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:57:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][290/625] eta 0:03:15 lr 0.000451 wd 0.0500 time 0.5602 (0.5832) data time 0.0008 (0.0021) model time 0.5594 (0.5830) loss 6.9221 (7.2918) grad_norm 1.4310 (2.7347) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:57:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][300/625] eta 0:03:09 lr 0.000451 wd 0.0500 time 0.5718 (0.5829) data time 0.0007 (0.0020) model time 0.5711 (0.5827) loss 8.2840 (7.2914) grad_norm 2.4046 (2.7348) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:57:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][310/625] eta 0:03:03 lr 0.000451 wd 0.0500 time 0.5705 (0.5827) data time 0.0006 (0.0020) model time 0.5698 (0.5824) loss 8.1361 (7.3117) grad_norm 1.9226 (2.7118) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:57:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][320/625] eta 0:02:57 lr 0.000451 wd 0.0500 time 0.6236 (0.5830) data time 0.0008 (0.0020) model time 0.6228 (0.5828) loss 7.9763 (7.3197) grad_norm 2.8604 (2.6995) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:57:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][330/625] eta 0:02:52 lr 0.000451 wd 0.0500 time 0.7355 (0.5835) data time 0.0006 (0.0019) model time 0.7349 (0.5833) loss 8.0288 (7.3224) grad_norm 2.4893 (2.6938) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 08:57:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][340/625] eta 0:02:46 lr 0.000451 wd 0.0500 time 0.7169 (0.5836) data time 0.0008 (0.0019) model time 0.7162 (0.5835) loss 8.2933 (7.3259) grad_norm 3.7812 (inf) loss_scale 512.0000 (1020.9971) mem 22339MB +[2024-07-25 08:57:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][350/625] eta 0:02:41 lr 0.000451 wd 0.0500 time 0.7080 (0.5859) data time 0.0006 (0.0019) model time 0.7074 (0.5860) loss 6.6614 (7.3324) grad_norm 5.5933 (inf) loss_scale 512.0000 (1006.4957) mem 22339MB +[2024-07-25 08:57:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][360/625] eta 0:02:35 lr 0.000451 wd 0.0500 time 0.6175 (0.5869) data time 0.0006 (0.0018) model time 0.6169 (0.5872) loss 7.5804 (7.3289) grad_norm 2.4391 (inf) loss_scale 512.0000 (992.7978) mem 22339MB +[2024-07-25 08:57:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][370/625] eta 0:02:29 lr 0.000451 wd 0.0500 time 0.5681 (0.5875) data time 0.0009 (0.0018) model time 0.5673 (0.5879) loss 6.7788 (7.3362) grad_norm 3.7287 (inf) loss_scale 512.0000 (979.8383) mem 22339MB +[2024-07-25 08:58:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][380/625] eta 0:02:23 lr 0.000450 wd 0.0500 time 0.5731 (0.5874) data time 0.0006 (0.0018) model time 0.5725 (0.5878) loss 7.3279 (7.3395) grad_norm 1.9554 (inf) loss_scale 512.0000 (967.5591) mem 22339MB +[2024-07-25 08:58:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][390/625] eta 0:02:17 lr 0.000450 wd 0.0500 time 0.5716 (0.5872) data time 0.0007 (0.0018) model time 0.5709 (0.5874) loss 6.8532 (7.3339) grad_norm 2.8112 (inf) loss_scale 512.0000 (955.9079) mem 22339MB +[2024-07-25 08:58:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][400/625] eta 0:02:12 lr 0.000450 wd 0.0500 time 0.5726 (0.5868) data time 0.0006 (0.0018) model time 0.5720 (0.5870) loss 7.1045 (7.3285) grad_norm 2.1471 (inf) loss_scale 512.0000 (944.8379) mem 22339MB +[2024-07-25 08:58:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][410/625] eta 0:02:06 lr 0.000450 wd 0.0500 time 0.5720 (0.5866) data time 0.0008 (0.0017) model time 0.5711 (0.5866) loss 8.1210 (7.3316) grad_norm 5.6774 (inf) loss_scale 512.0000 (934.3066) mem 22339MB +[2024-07-25 08:58:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][420/625] eta 0:02:00 lr 0.000450 wd 0.0500 time 0.5746 (0.5863) data time 0.0008 (0.0017) model time 0.5738 (0.5863) loss 5.3715 (7.3253) grad_norm 2.2609 (inf) loss_scale 512.0000 (924.2755) mem 22339MB +[2024-07-25 08:58:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][430/625] eta 0:01:54 lr 0.000450 wd 0.0500 time 0.5641 (0.5861) data time 0.0008 (0.0017) model time 0.5633 (0.5861) loss 7.2858 (7.3188) grad_norm 3.4530 (inf) loss_scale 512.0000 (914.7100) mem 22339MB +[2024-07-25 08:58:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][440/625] eta 0:01:48 lr 0.000450 wd 0.0500 time 0.5732 (0.5859) data time 0.0006 (0.0017) model time 0.5726 (0.5858) loss 7.0911 (7.3164) grad_norm 3.5891 (inf) loss_scale 512.0000 (905.5782) mem 22339MB +[2024-07-25 08:58:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][450/625] eta 0:01:42 lr 0.000450 wd 0.0500 time 0.5713 (0.5857) data time 0.0008 (0.0017) model time 0.5705 (0.5856) loss 5.8846 (7.3121) grad_norm 2.5668 (inf) loss_scale 512.0000 (896.8514) mem 22339MB +[2024-07-25 08:58:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][460/625] eta 0:01:36 lr 0.000450 wd 0.0500 time 0.5747 (0.5854) data time 0.0008 (0.0016) model time 0.5739 (0.5853) loss 6.1143 (7.3096) grad_norm 2.4925 (inf) loss_scale 512.0000 (888.5033) mem 22339MB +[2024-07-25 08:58:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][470/625] eta 0:01:30 lr 0.000450 wd 0.0500 time 0.5717 (0.5852) data time 0.0006 (0.0016) model time 0.5711 (0.5850) loss 7.4390 (7.3079) grad_norm 1.9685 (inf) loss_scale 512.0000 (880.5096) mem 22339MB +[2024-07-25 08:59:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][480/625] eta 0:01:24 lr 0.000449 wd 0.0500 time 0.5733 (0.5850) data time 0.0007 (0.0016) model time 0.5726 (0.5847) loss 8.3911 (7.3021) grad_norm 1.9915 (inf) loss_scale 512.0000 (872.8482) mem 22339MB +[2024-07-25 08:59:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][490/625] eta 0:01:18 lr 0.000449 wd 0.0500 time 0.5743 (0.5848) data time 0.0007 (0.0016) model time 0.5736 (0.5845) loss 6.0087 (7.2999) grad_norm 2.1803 (inf) loss_scale 512.0000 (865.4990) mem 22339MB +[2024-07-25 08:59:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][500/625] eta 0:01:13 lr 0.000449 wd 0.0500 time 0.5770 (0.5846) data time 0.0008 (0.0016) model time 0.5762 (0.5843) loss 7.2306 (7.2944) grad_norm 2.0038 (inf) loss_scale 512.0000 (858.4431) mem 22339MB +[2024-07-25 08:59:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][510/625] eta 0:01:07 lr 0.000449 wd 0.0500 time 0.5744 (0.5844) data time 0.0008 (0.0016) model time 0.5736 (0.5841) loss 8.6990 (7.2987) grad_norm 2.4466 (inf) loss_scale 512.0000 (851.6634) mem 22339MB +[2024-07-25 08:59:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][520/625] eta 0:01:01 lr 0.000449 wd 0.0500 time 0.5731 (0.5842) data time 0.0008 (0.0016) model time 0.5724 (0.5839) loss 9.0106 (7.3039) grad_norm 2.2845 (inf) loss_scale 512.0000 (845.1440) mem 22339MB +[2024-07-25 08:59:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][530/625] eta 0:00:55 lr 0.000449 wd 0.0500 time 0.5756 (0.5841) data time 0.0006 (0.0015) model time 0.5750 (0.5837) loss 7.9354 (7.3036) grad_norm 1.9990 (inf) loss_scale 512.0000 (838.8701) mem 22339MB +[2024-07-25 08:59:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][540/625] eta 0:00:49 lr 0.000449 wd 0.0500 time 0.6758 (0.5842) data time 0.0007 (0.0015) model time 0.6750 (0.5838) loss 8.2783 (7.3046) grad_norm 2.8024 (inf) loss_scale 512.0000 (832.8281) mem 22339MB +[2024-07-25 08:59:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][550/625] eta 0:00:43 lr 0.000449 wd 0.0500 time 0.6993 (0.5845) data time 0.0006 (0.0015) model time 0.6987 (0.5841) loss 7.5508 (7.3080) grad_norm 1.8505 (inf) loss_scale 512.0000 (827.0054) mem 22339MB +[2024-07-25 08:59:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][560/625] eta 0:00:38 lr 0.000449 wd 0.0500 time 0.7231 (0.5851) data time 0.0009 (0.0015) model time 0.7223 (0.5848) loss 8.0061 (7.3154) grad_norm 2.0786 (inf) loss_scale 512.0000 (821.3904) mem 22339MB +[2024-07-25 08:59:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][570/625] eta 0:00:32 lr 0.000449 wd 0.0500 time 0.7432 (0.5859) data time 0.0006 (0.0015) model time 0.7426 (0.5856) loss 6.5670 (7.3209) grad_norm 1.8631 (inf) loss_scale 512.0000 (815.9720) mem 22339MB +[2024-07-25 09:00:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][580/625] eta 0:00:26 lr 0.000448 wd 0.0500 time 0.7412 (0.5866) data time 0.0006 (0.0015) model time 0.7405 (0.5863) loss 7.6784 (7.3280) grad_norm 1.7589 (inf) loss_scale 512.0000 (810.7401) mem 22339MB +[2024-07-25 09:00:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][590/625] eta 0:00:20 lr 0.000448 wd 0.0500 time 0.5755 (0.5867) data time 0.0008 (0.0015) model time 0.5747 (0.5864) loss 7.8801 (7.3279) grad_norm 2.6135 (inf) loss_scale 512.0000 (805.6853) mem 22339MB +[2024-07-25 09:00:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][600/625] eta 0:00:14 lr 0.000448 wd 0.0500 time 0.5721 (0.5867) data time 0.0008 (0.0015) model time 0.5713 (0.5865) loss 8.1455 (7.3260) grad_norm 2.3459 (inf) loss_scale 512.0000 (800.7987) mem 22339MB +[2024-07-25 09:00:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][610/625] eta 0:00:08 lr 0.000448 wd 0.0500 time 0.5683 (0.5865) data time 0.0004 (0.0015) model time 0.5679 (0.5862) loss 6.8352 (7.3259) grad_norm 1.9416 (inf) loss_scale 512.0000 (796.0720) mem 22339MB +[2024-07-25 09:00:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [183/300][620/625] eta 0:00:02 lr 0.000448 wd 0.0500 time 0.5622 (0.5863) data time 0.0006 (0.0014) model time 0.5617 (0.5860) loss 6.8133 (7.3262) grad_norm 1.7085 (inf) loss_scale 512.0000 (791.4976) mem 22339MB +[2024-07-25 09:00:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 183 training takes 0:06:06 +[2024-07-25 09:00:26 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 09:00:28 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 09:00:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.479 (0.479) Loss 0.5078 (0.5078) Acc@1 89.893 (89.893) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-25 09:00:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.7935 (0.6364) Acc@1 81.885 (86.927) Acc@5 95.898 (97.865) Mem 22339MB +[2024-07-25 09:00:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9136 (0.7364) Acc@1 77.100 (83.882) Acc@5 95.654 (96.896) Mem 22339MB +[2024-07-25 09:00:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.501 Acc@5 96.859 +[2024-07-25 09:00:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.5% +[2024-07-25 09:00:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 83.50% +[2024-07-25 09:00:31 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 09:00:33 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 09:00:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.473 (0.473) Loss 0.4995 (0.4995) Acc@1 89.990 (89.990) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-25 09:00:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7524 (0.6216) Acc@1 83.008 (87.114) Acc@5 96.631 (97.949) Mem 22339MB +[2024-07-25 09:00:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.8794 (0.7178) Acc@1 78.711 (84.126) Acc@5 95.898 (96.996) Mem 22339MB +[2024-07-25 09:00:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.725 Acc@5 96.981 +[2024-07-25 09:00:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.7% +[2024-07-25 09:00:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.73% +[2024-07-25 09:00:36 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 09:00:38 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 09:00:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][0/625] eta 0:09:50 lr 0.000448 wd 0.0500 time 0.9449 (0.9449) data time 0.4281 (0.4281) model time 0.0000 (0.0000) loss 6.9165 (6.9165) grad_norm 2.6092 (2.6092) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:00:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][10/625] eta 0:06:13 lr 0.000448 wd 0.0500 time 0.5716 (0.6073) data time 0.0008 (0.0397) model time 0.0000 (0.0000) loss 7.4218 (7.4996) grad_norm 1.7499 (2.3775) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:00:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][20/625] eta 0:05:57 lr 0.000448 wd 0.0500 time 0.5723 (0.5913) data time 0.0008 (0.0212) model time 0.0000 (0.0000) loss 7.1560 (7.5069) grad_norm 2.2345 (2.2735) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:00:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][30/625] eta 0:05:48 lr 0.000448 wd 0.0500 time 0.5701 (0.5856) data time 0.0008 (0.0146) model time 0.0000 (0.0000) loss 8.1947 (7.5963) grad_norm 2.1201 (2.2834) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:01:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][40/625] eta 0:05:40 lr 0.000448 wd 0.0500 time 0.5690 (0.5827) data time 0.0006 (0.0113) model time 0.0000 (0.0000) loss 6.9226 (7.5327) grad_norm 2.9449 (2.2499) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:01:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][50/625] eta 0:05:34 lr 0.000447 wd 0.0500 time 0.5679 (0.5811) data time 0.0006 (0.0092) model time 0.0000 (0.0000) loss 7.3675 (7.5156) grad_norm 2.9709 (2.2019) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:01:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][60/625] eta 0:05:27 lr 0.000447 wd 0.0500 time 0.5698 (0.5798) data time 0.0008 (0.0079) model time 0.5690 (0.5724) loss 6.6372 (7.4261) grad_norm 1.8455 (2.2510) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:01:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][70/625] eta 0:05:21 lr 0.000447 wd 0.0500 time 0.5742 (0.5790) data time 0.0006 (0.0069) model time 0.5736 (0.5727) loss 5.9841 (7.3692) grad_norm 3.1920 (2.3239) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:01:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][80/625] eta 0:05:15 lr 0.000447 wd 0.0500 time 0.5708 (0.5783) data time 0.0006 (0.0062) model time 0.5702 (0.5728) loss 6.5189 (7.3492) grad_norm 2.4260 (2.3321) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:01:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][90/625] eta 0:05:09 lr 0.000447 wd 0.0500 time 0.5760 (0.5781) data time 0.0008 (0.0056) model time 0.5752 (0.5733) loss 6.9522 (7.3324) grad_norm 2.2446 (2.3851) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:01:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][100/625] eta 0:05:03 lr 0.000447 wd 0.0500 time 0.5737 (0.5778) data time 0.0007 (0.0051) model time 0.5730 (0.5735) loss 6.4116 (7.3075) grad_norm 1.7040 (2.3771) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:01:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][110/625] eta 0:04:57 lr 0.000447 wd 0.0500 time 0.5682 (0.5775) data time 0.0010 (0.0047) model time 0.5672 (0.5736) loss 7.2060 (7.3404) grad_norm 2.1578 (2.3583) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:01:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][120/625] eta 0:04:51 lr 0.000447 wd 0.0500 time 0.5735 (0.5773) data time 0.0006 (0.0044) model time 0.5729 (0.5736) loss 8.5520 (7.3243) grad_norm 2.4492 (2.3430) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:01:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][130/625] eta 0:04:45 lr 0.000447 wd 0.0500 time 0.5734 (0.5771) data time 0.0007 (0.0042) model time 0.5728 (0.5737) loss 6.1719 (7.3352) grad_norm 5.4315 (2.3494) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:01:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][140/625] eta 0:04:40 lr 0.000447 wd 0.0500 time 0.5741 (0.5782) data time 0.0009 (0.0039) model time 0.5732 (0.5757) loss 6.7893 (7.3612) grad_norm 3.9339 (2.3934) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:02:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][150/625] eta 0:04:35 lr 0.000446 wd 0.0500 time 0.5618 (0.5799) data time 0.0008 (0.0037) model time 0.5609 (0.5784) loss 6.7924 (7.3332) grad_norm 1.6297 (2.4003) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:02:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][160/625] eta 0:04:31 lr 0.000446 wd 0.0500 time 0.7677 (0.5841) data time 0.0008 (0.0035) model time 0.7669 (0.5846) loss 6.3992 (7.3276) grad_norm 2.6607 (2.4219) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:02:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][170/625] eta 0:04:27 lr 0.000446 wd 0.0500 time 0.5716 (0.5887) data time 0.0008 (0.0034) model time 0.5708 (0.5911) loss 8.9245 (7.3302) grad_norm 2.5689 (2.4292) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:02:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][180/625] eta 0:04:23 lr 0.000446 wd 0.0500 time 0.5621 (0.5914) data time 0.0008 (0.0032) model time 0.5613 (0.5946) loss 6.4350 (7.3238) grad_norm 1.6287 (2.4202) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:02:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][190/625] eta 0:04:17 lr 0.000446 wd 0.0500 time 0.5728 (0.5911) data time 0.0008 (0.0031) model time 0.5720 (0.5938) loss 8.8367 (7.3173) grad_norm 2.1963 (2.4404) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:02:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][200/625] eta 0:04:10 lr 0.000446 wd 0.0500 time 0.5736 (0.5905) data time 0.0008 (0.0030) model time 0.5729 (0.5928) loss 7.5067 (7.3102) grad_norm 2.8323 (2.4568) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:02:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][210/625] eta 0:04:04 lr 0.000446 wd 0.0500 time 0.5724 (0.5897) data time 0.0007 (0.0029) model time 0.5718 (0.5916) loss 6.2754 (7.3280) grad_norm 1.9960 (2.4898) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:02:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][220/625] eta 0:03:58 lr 0.000446 wd 0.0500 time 0.5726 (0.5890) data time 0.0006 (0.0028) model time 0.5720 (0.5905) loss 6.1967 (7.3410) grad_norm 3.1959 (2.4850) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:02:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][230/625] eta 0:03:52 lr 0.000446 wd 0.0500 time 0.5729 (0.5884) data time 0.0008 (0.0027) model time 0.5721 (0.5896) loss 8.3199 (7.3516) grad_norm 2.4688 (2.4760) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:02:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][240/625] eta 0:03:46 lr 0.000446 wd 0.0500 time 0.5747 (0.5878) data time 0.0006 (0.0026) model time 0.5742 (0.5887) loss 7.0488 (7.3721) grad_norm 2.4960 (2.4905) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:03:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][250/625] eta 0:03:40 lr 0.000445 wd 0.0500 time 0.5724 (0.5872) data time 0.0006 (0.0026) model time 0.5717 (0.5879) loss 6.8739 (7.3706) grad_norm 2.4616 (2.5212) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:03:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][260/625] eta 0:03:34 lr 0.000445 wd 0.0500 time 0.5698 (0.5867) data time 0.0007 (0.0025) model time 0.5691 (0.5872) loss 6.4804 (7.3767) grad_norm 2.6549 (2.5365) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:03:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][270/625] eta 0:03:28 lr 0.000445 wd 0.0500 time 0.5728 (0.5862) data time 0.0007 (0.0024) model time 0.5720 (0.5865) loss 7.1838 (7.3725) grad_norm 1.5737 (2.5408) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:03:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][280/625] eta 0:03:22 lr 0.000445 wd 0.0500 time 0.5744 (0.5858) data time 0.0006 (0.0024) model time 0.5738 (0.5860) loss 6.8335 (7.3716) grad_norm 3.3485 (2.5415) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:03:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][290/625] eta 0:03:16 lr 0.000445 wd 0.0500 time 0.5734 (0.5854) data time 0.0006 (0.0023) model time 0.5729 (0.5855) loss 6.8028 (7.3762) grad_norm 3.4853 (2.5483) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:03:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][300/625] eta 0:03:10 lr 0.000445 wd 0.0500 time 0.5661 (0.5851) data time 0.0006 (0.0023) model time 0.5655 (0.5851) loss 5.8469 (7.3680) grad_norm 2.8267 (2.5499) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:03:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][310/625] eta 0:03:04 lr 0.000445 wd 0.0500 time 0.5719 (0.5849) data time 0.0006 (0.0022) model time 0.5713 (0.5847) loss 7.4402 (7.3736) grad_norm 2.8621 (2.5498) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:03:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][320/625] eta 0:02:58 lr 0.000445 wd 0.0500 time 0.5728 (0.5846) data time 0.0005 (0.0022) model time 0.5722 (0.5844) loss 8.8716 (7.3778) grad_norm 2.5128 (2.5510) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:03:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][330/625] eta 0:02:52 lr 0.000445 wd 0.0500 time 0.5747 (0.5843) data time 0.0006 (0.0022) model time 0.5741 (0.5840) loss 7.7682 (7.3867) grad_norm 2.5186 (2.5477) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:03:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][340/625] eta 0:02:46 lr 0.000444 wd 0.0500 time 0.5721 (0.5840) data time 0.0006 (0.0021) model time 0.5715 (0.5836) loss 8.0145 (7.3866) grad_norm 2.0626 (2.5378) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:04:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][350/625] eta 0:02:40 lr 0.000444 wd 0.0500 time 0.5711 (0.5837) data time 0.0007 (0.0021) model time 0.5704 (0.5833) loss 6.8664 (7.3720) grad_norm 1.9154 (2.5387) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:04:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][360/625] eta 0:02:34 lr 0.000444 wd 0.0500 time 0.5717 (0.5838) data time 0.0007 (0.0020) model time 0.5710 (0.5834) loss 7.5366 (7.3683) grad_norm 2.0431 (2.5358) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:04:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][370/625] eta 0:02:28 lr 0.000444 wd 0.0500 time 0.5733 (0.5839) data time 0.0006 (0.0020) model time 0.5728 (0.5835) loss 6.1941 (7.3665) grad_norm 2.4648 (2.5332) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:04:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][380/625] eta 0:02:23 lr 0.000444 wd 0.0500 time 0.7476 (0.5853) data time 0.0009 (0.0020) model time 0.7467 (0.5851) loss 7.8596 (7.3741) grad_norm 2.1226 (inf) loss_scale 256.0000 (508.6404) mem 22339MB +[2024-07-25 09:04:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][390/625] eta 0:02:17 lr 0.000444 wd 0.0500 time 0.5633 (0.5862) data time 0.0008 (0.0020) model time 0.5625 (0.5861) loss 6.4127 (7.3713) grad_norm 1.7987 (inf) loss_scale 256.0000 (502.1790) mem 22339MB +[2024-07-25 09:04:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][400/625] eta 0:02:12 lr 0.000444 wd 0.0500 time 0.5632 (0.5871) data time 0.0007 (0.0019) model time 0.5625 (0.5871) loss 7.7366 (7.3633) grad_norm 2.4993 (inf) loss_scale 256.0000 (496.0399) mem 22339MB +[2024-07-25 09:04:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][410/625] eta 0:02:06 lr 0.000444 wd 0.0500 time 0.5705 (0.5872) data time 0.0009 (0.0019) model time 0.5696 (0.5872) loss 8.1347 (7.3694) grad_norm 2.0377 (inf) loss_scale 256.0000 (490.1995) mem 22339MB +[2024-07-25 09:04:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][420/625] eta 0:02:00 lr 0.000444 wd 0.0500 time 0.5728 (0.5871) data time 0.0006 (0.0019) model time 0.5722 (0.5870) loss 7.8084 (7.3674) grad_norm 1.9844 (inf) loss_scale 256.0000 (484.6366) mem 22339MB +[2024-07-25 09:04:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][430/625] eta 0:01:54 lr 0.000444 wd 0.0500 time 0.5717 (0.5868) data time 0.0008 (0.0019) model time 0.5708 (0.5867) loss 8.6646 (7.3716) grad_norm 1.5814 (inf) loss_scale 256.0000 (479.3318) mem 22339MB +[2024-07-25 09:04:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][440/625] eta 0:01:48 lr 0.000443 wd 0.0500 time 0.5734 (0.5865) data time 0.0006 (0.0018) model time 0.5728 (0.5864) loss 6.8751 (7.3571) grad_norm 2.2013 (inf) loss_scale 256.0000 (474.2676) mem 22339MB +[2024-07-25 09:05:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][450/625] eta 0:01:42 lr 0.000443 wd 0.0500 time 0.5740 (0.5863) data time 0.0009 (0.0018) model time 0.5732 (0.5861) loss 8.1514 (7.3650) grad_norm 2.0784 (inf) loss_scale 256.0000 (469.4279) mem 22339MB +[2024-07-25 09:05:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][460/625] eta 0:01:36 lr 0.000443 wd 0.0500 time 0.5711 (0.5860) data time 0.0007 (0.0018) model time 0.5705 (0.5858) loss 7.0103 (7.3600) grad_norm 3.8380 (inf) loss_scale 256.0000 (464.7983) mem 22339MB +[2024-07-25 09:05:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][470/625] eta 0:01:30 lr 0.000443 wd 0.0500 time 0.5699 (0.5858) data time 0.0008 (0.0018) model time 0.5691 (0.5855) loss 6.7205 (7.3769) grad_norm 3.4051 (inf) loss_scale 256.0000 (460.3652) mem 22339MB +[2024-07-25 09:05:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][480/625] eta 0:01:24 lr 0.000443 wd 0.0500 time 0.5705 (0.5856) data time 0.0006 (0.0018) model time 0.5699 (0.5852) loss 7.6642 (7.3844) grad_norm 2.1475 (inf) loss_scale 256.0000 (456.1164) mem 22339MB +[2024-07-25 09:05:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][490/625] eta 0:01:19 lr 0.000443 wd 0.0500 time 0.5736 (0.5853) data time 0.0006 (0.0017) model time 0.5730 (0.5849) loss 8.0585 (7.3852) grad_norm 1.9285 (inf) loss_scale 256.0000 (452.0407) mem 22339MB +[2024-07-25 09:05:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][500/625] eta 0:01:13 lr 0.000443 wd 0.0500 time 0.5718 (0.5851) data time 0.0008 (0.0017) model time 0.5709 (0.5847) loss 7.8514 (7.3801) grad_norm 2.9117 (inf) loss_scale 256.0000 (448.1277) mem 22339MB +[2024-07-25 09:05:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][510/625] eta 0:01:07 lr 0.000443 wd 0.0500 time 0.5700 (0.5849) data time 0.0006 (0.0017) model time 0.5694 (0.5844) loss 7.5608 (7.3773) grad_norm 1.9787 (inf) loss_scale 256.0000 (444.3679) mem 22339MB +[2024-07-25 09:05:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][520/625] eta 0:01:01 lr 0.000443 wd 0.0500 time 0.5716 (0.5847) data time 0.0008 (0.0017) model time 0.5708 (0.5842) loss 7.1040 (7.3736) grad_norm 3.2982 (inf) loss_scale 256.0000 (440.7524) mem 22339MB +[2024-07-25 09:05:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][530/625] eta 0:00:55 lr 0.000443 wd 0.0500 time 0.5758 (0.5845) data time 0.0008 (0.0017) model time 0.5750 (0.5840) loss 7.9336 (7.3680) grad_norm 2.6815 (inf) loss_scale 256.0000 (437.2731) mem 22339MB +[2024-07-25 09:05:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][540/625] eta 0:00:49 lr 0.000442 wd 0.0500 time 0.5732 (0.5843) data time 0.0006 (0.0017) model time 0.5726 (0.5838) loss 7.9049 (7.3665) grad_norm 1.8062 (inf) loss_scale 256.0000 (433.9224) mem 22339MB +[2024-07-25 09:06:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][550/625] eta 0:00:43 lr 0.000442 wd 0.0500 time 0.5737 (0.5842) data time 0.0006 (0.0016) model time 0.5731 (0.5836) loss 8.0196 (7.3609) grad_norm 2.2728 (inf) loss_scale 256.0000 (430.6933) mem 22339MB +[2024-07-25 09:06:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][560/625] eta 0:00:37 lr 0.000442 wd 0.0500 time 0.5722 (0.5840) data time 0.0008 (0.0016) model time 0.5714 (0.5834) loss 7.5758 (7.3678) grad_norm 1.7095 (inf) loss_scale 256.0000 (427.5793) mem 22339MB +[2024-07-25 09:06:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][570/625] eta 0:00:32 lr 0.000442 wd 0.0500 time 0.5767 (0.5838) data time 0.0007 (0.0016) model time 0.5759 (0.5833) loss 8.6769 (7.3758) grad_norm 1.6520 (inf) loss_scale 256.0000 (424.5744) mem 22339MB +[2024-07-25 09:06:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][580/625] eta 0:00:26 lr 0.000442 wd 0.0500 time 0.5721 (0.5840) data time 0.0008 (0.0016) model time 0.5713 (0.5834) loss 9.2526 (7.3762) grad_norm 4.0155 (inf) loss_scale 256.0000 (421.6730) mem 22339MB +[2024-07-25 09:06:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][590/625] eta 0:00:20 lr 0.000442 wd 0.0500 time 0.5685 (0.5845) data time 0.0007 (0.0016) model time 0.5677 (0.5840) loss 8.2804 (7.3818) grad_norm 2.0912 (inf) loss_scale 256.0000 (418.8697) mem 22339MB +[2024-07-25 09:06:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][600/625] eta 0:00:14 lr 0.000442 wd 0.0500 time 0.7239 (0.5852) data time 0.0008 (0.0016) model time 0.7232 (0.5847) loss 8.9739 (7.3912) grad_norm 2.3170 (inf) loss_scale 256.0000 (416.1597) mem 22339MB +[2024-07-25 09:06:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][610/625] eta 0:00:08 lr 0.000442 wd 0.0500 time 0.5731 (0.5859) data time 0.0004 (0.0016) model time 0.5727 (0.5855) loss 5.9070 (7.3981) grad_norm 2.5883 (inf) loss_scale 256.0000 (413.5385) mem 22339MB +[2024-07-25 09:06:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [184/300][620/625] eta 0:00:02 lr 0.000442 wd 0.0500 time 0.7112 (0.5865) data time 0.0005 (0.0016) model time 0.7106 (0.5861) loss 6.4203 (7.3957) grad_norm 1.9626 (inf) loss_scale 256.0000 (411.0016) mem 22339MB +[2024-07-25 09:06:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 184 training takes 0:06:06 +[2024-07-25 09:06:44 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 09:06:46 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 09:06:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.487 (0.487) Loss 0.5151 (0.5151) Acc@1 90.088 (90.088) Acc@5 98.730 (98.730) Mem 22339MB +[2024-07-25 09:06:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.158) Loss 0.7803 (0.6372) Acc@1 82.373 (86.910) Acc@5 96.289 (97.892) Mem 22339MB +[2024-07-25 09:06:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.8975 (0.7396) Acc@1 78.467 (83.873) Acc@5 95.801 (96.868) Mem 22339MB +[2024-07-25 09:06:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.517 Acc@5 96.841 +[2024-07-25 09:06:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.5% +[2024-07-25 09:06:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 83.52% +[2024-07-25 09:06:49 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 09:06:51 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 09:06:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.472 (0.472) Loss 0.4993 (0.4993) Acc@1 89.990 (89.990) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-25 09:06:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7524 (0.6214) Acc@1 83.154 (87.118) Acc@5 96.582 (97.940) Mem 22339MB +[2024-07-25 09:06:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8784 (0.7173) Acc@1 78.711 (84.156) Acc@5 95.898 (96.994) Mem 22339MB +[2024-07-25 09:06:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.759 Acc@5 96.977 +[2024-07-25 09:06:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.8% +[2024-07-25 09:06:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.76% +[2024-07-25 09:06:54 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 09:06:56 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 09:06:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][0/625] eta 0:09:17 lr 0.000442 wd 0.0500 time 0.8913 (0.8913) data time 0.3723 (0.3723) model time 0.0000 (0.0000) loss 7.7700 (7.7700) grad_norm 2.0625 (2.0625) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:07:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][10/625] eta 0:06:27 lr 0.000441 wd 0.0500 time 0.5747 (0.6296) data time 0.0008 (0.0346) model time 0.0000 (0.0000) loss 7.9897 (7.4188) grad_norm 2.6356 (2.1246) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:07:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][20/625] eta 0:06:04 lr 0.000441 wd 0.0500 time 0.5624 (0.6030) data time 0.0009 (0.0186) model time 0.0000 (0.0000) loss 6.8816 (7.4013) grad_norm 6.8556 (2.3706) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:07:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][30/625] eta 0:05:53 lr 0.000441 wd 0.0500 time 0.5724 (0.5935) data time 0.0008 (0.0130) model time 0.0000 (0.0000) loss 8.9729 (7.5252) grad_norm 1.6894 (2.5574) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:07:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][40/625] eta 0:05:44 lr 0.000441 wd 0.0500 time 0.5712 (0.5884) data time 0.0009 (0.0100) model time 0.0000 (0.0000) loss 8.1508 (7.5273) grad_norm 2.4206 (2.8653) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:07:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][50/625] eta 0:05:36 lr 0.000441 wd 0.0500 time 0.5712 (0.5854) data time 0.0006 (0.0082) model time 0.0000 (0.0000) loss 7.7772 (7.4528) grad_norm 2.0772 (2.7676) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:07:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][60/625] eta 0:05:29 lr 0.000441 wd 0.0500 time 0.5724 (0.5835) data time 0.0008 (0.0070) model time 0.5716 (0.5727) loss 7.9989 (7.3820) grad_norm 2.0341 (2.7108) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:07:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][70/625] eta 0:05:23 lr 0.000441 wd 0.0500 time 0.5722 (0.5823) data time 0.0009 (0.0062) model time 0.5713 (0.5732) loss 7.7813 (7.4024) grad_norm 1.7208 (2.7371) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:07:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][80/625] eta 0:05:16 lr 0.000441 wd 0.0500 time 0.5717 (0.5814) data time 0.0006 (0.0055) model time 0.5710 (0.5734) loss 7.1451 (7.3020) grad_norm 1.8718 (2.6620) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:07:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][90/625] eta 0:05:10 lr 0.000441 wd 0.0500 time 0.5623 (0.5804) data time 0.0008 (0.0050) model time 0.5615 (0.5729) loss 8.1853 (7.3007) grad_norm 3.9469 (2.6528) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:07:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][100/625] eta 0:05:04 lr 0.000441 wd 0.0500 time 0.5730 (0.5805) data time 0.0006 (0.0046) model time 0.5724 (0.5745) loss 8.2883 (7.3028) grad_norm 2.0424 (2.6454) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:08:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][110/625] eta 0:04:58 lr 0.000440 wd 0.0500 time 0.5716 (0.5799) data time 0.0007 (0.0043) model time 0.5709 (0.5743) loss 5.2369 (7.2906) grad_norm 2.0746 (2.6039) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:08:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][120/625] eta 0:04:52 lr 0.000440 wd 0.0500 time 0.5738 (0.5795) data time 0.0009 (0.0040) model time 0.5730 (0.5742) loss 8.2408 (7.3258) grad_norm 1.6691 (2.5844) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:08:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][130/625] eta 0:04:46 lr 0.000440 wd 0.0500 time 0.5689 (0.5791) data time 0.0006 (0.0038) model time 0.5683 (0.5741) loss 6.5780 (7.3793) grad_norm 8.7157 (2.6181) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:08:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][140/625] eta 0:04:40 lr 0.000440 wd 0.0500 time 0.5678 (0.5787) data time 0.0007 (0.0036) model time 0.5671 (0.5740) loss 7.8976 (7.3732) grad_norm 1.9885 (2.5982) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:08:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][150/625] eta 0:04:34 lr 0.000440 wd 0.0500 time 0.5736 (0.5785) data time 0.0008 (0.0034) model time 0.5728 (0.5740) loss 8.0359 (7.3967) grad_norm 2.6962 (2.5658) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:08:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][160/625] eta 0:04:28 lr 0.000440 wd 0.0500 time 0.5767 (0.5783) data time 0.0008 (0.0032) model time 0.5759 (0.5742) loss 6.5900 (7.3704) grad_norm 2.9949 (2.5339) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:08:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][170/625] eta 0:04:23 lr 0.000440 wd 0.0500 time 0.5665 (0.5790) data time 0.0007 (0.0031) model time 0.5658 (0.5754) loss 7.8526 (7.3519) grad_norm 2.1528 (2.5497) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:08:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][180/625] eta 0:04:17 lr 0.000440 wd 0.0500 time 0.5743 (0.5797) data time 0.0007 (0.0030) model time 0.5737 (0.5766) loss 8.2760 (7.3829) grad_norm 3.7731 (2.5361) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:08:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][190/625] eta 0:04:13 lr 0.000440 wd 0.0500 time 0.5746 (0.5819) data time 0.0006 (0.0029) model time 0.5739 (0.5797) loss 8.5532 (7.3976) grad_norm 2.0494 (2.5452) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:08:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][200/625] eta 0:04:08 lr 0.000440 wd 0.0500 time 0.5700 (0.5858) data time 0.0006 (0.0028) model time 0.5693 (0.5850) loss 6.7301 (7.3930) grad_norm 2.1995 (2.5440) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:09:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][210/625] eta 0:04:04 lr 0.000439 wd 0.0500 time 0.7341 (0.5881) data time 0.0006 (0.0027) model time 0.7335 (0.5881) loss 7.4150 (7.3895) grad_norm 2.8612 (2.5299) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:09:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][220/625] eta 0:03:58 lr 0.000439 wd 0.0500 time 0.6252 (0.5890) data time 0.0006 (0.0026) model time 0.6245 (0.5892) loss 7.4344 (7.4116) grad_norm 3.6049 (2.5230) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:09:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][230/625] eta 0:03:52 lr 0.000439 wd 0.0500 time 0.5714 (0.5888) data time 0.0006 (0.0025) model time 0.5708 (0.5889) loss 7.3743 (7.4113) grad_norm 2.1416 (2.6742) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:09:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][240/625] eta 0:03:46 lr 0.000439 wd 0.0500 time 0.5684 (0.5882) data time 0.0007 (0.0024) model time 0.5677 (0.5880) loss 6.8374 (7.4085) grad_norm 2.8806 (2.6735) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:09:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][250/625] eta 0:03:40 lr 0.000439 wd 0.0500 time 0.5729 (0.5876) data time 0.0006 (0.0024) model time 0.5722 (0.5873) loss 7.4857 (7.4099) grad_norm 3.9168 (2.7015) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:09:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][260/625] eta 0:03:34 lr 0.000439 wd 0.0500 time 0.5687 (0.5871) data time 0.0007 (0.0023) model time 0.5680 (0.5866) loss 5.9925 (7.4186) grad_norm 1.4343 (2.6929) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:09:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][270/625] eta 0:03:28 lr 0.000439 wd 0.0500 time 0.5719 (0.5867) data time 0.0006 (0.0023) model time 0.5713 (0.5861) loss 7.4127 (7.4039) grad_norm 3.7477 (2.6884) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:09:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][280/625] eta 0:03:22 lr 0.000439 wd 0.0500 time 0.5653 (0.5862) data time 0.0006 (0.0022) model time 0.5647 (0.5855) loss 8.2089 (7.3960) grad_norm 3.1534 (2.6838) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:09:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][290/625] eta 0:03:16 lr 0.000439 wd 0.0500 time 0.5733 (0.5858) data time 0.0008 (0.0022) model time 0.5725 (0.5850) loss 6.6550 (7.3794) grad_norm 2.9450 (2.6921) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:09:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][300/625] eta 0:03:10 lr 0.000438 wd 0.0500 time 0.5611 (0.5854) data time 0.0006 (0.0021) model time 0.5605 (0.5845) loss 6.6744 (7.3759) grad_norm 3.4696 (2.6985) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:09:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][310/625] eta 0:03:04 lr 0.000438 wd 0.0500 time 0.5736 (0.5851) data time 0.0005 (0.0021) model time 0.5731 (0.5841) loss 7.2030 (7.3738) grad_norm 2.1446 (2.7010) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:10:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][320/625] eta 0:02:58 lr 0.000438 wd 0.0500 time 0.5758 (0.5851) data time 0.0006 (0.0021) model time 0.5753 (0.5842) loss 7.6702 (7.3665) grad_norm 1.6456 (2.6955) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:10:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][330/625] eta 0:02:52 lr 0.000438 wd 0.0500 time 0.5718 (0.5848) data time 0.0008 (0.0020) model time 0.5710 (0.5838) loss 6.6814 (7.3619) grad_norm 2.3456 (2.6793) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:10:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][340/625] eta 0:02:46 lr 0.000438 wd 0.0500 time 0.5742 (0.5845) data time 0.0008 (0.0020) model time 0.5734 (0.5835) loss 7.8103 (7.3663) grad_norm 3.0617 (2.6767) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:10:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][350/625] eta 0:02:40 lr 0.000438 wd 0.0500 time 0.5718 (0.5842) data time 0.0007 (0.0020) model time 0.5710 (0.5831) loss 5.8747 (7.3759) grad_norm 2.3638 (2.6610) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:10:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][360/625] eta 0:02:34 lr 0.000438 wd 0.0500 time 0.5693 (0.5840) data time 0.0008 (0.0019) model time 0.5685 (0.5828) loss 8.5720 (7.3698) grad_norm 1.7732 (2.6422) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:10:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][370/625] eta 0:02:28 lr 0.000438 wd 0.0500 time 0.5723 (0.5837) data time 0.0006 (0.0019) model time 0.5716 (0.5826) loss 8.4872 (7.3742) grad_norm 2.3101 (2.6282) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:10:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][380/625] eta 0:02:22 lr 0.000438 wd 0.0500 time 0.5730 (0.5835) data time 0.0008 (0.0019) model time 0.5722 (0.5823) loss 6.9612 (7.3814) grad_norm 1.6502 (2.6206) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:10:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][390/625] eta 0:02:17 lr 0.000438 wd 0.0500 time 0.6283 (0.5835) data time 0.0006 (0.0018) model time 0.6276 (0.5824) loss 7.3133 (7.3833) grad_norm 2.8121 (2.6200) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:10:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][400/625] eta 0:02:11 lr 0.000437 wd 0.0500 time 0.5722 (0.5833) data time 0.0006 (0.0018) model time 0.5716 (0.5821) loss 7.7495 (7.3768) grad_norm 1.8407 (2.6226) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:10:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][410/625] eta 0:02:05 lr 0.000437 wd 0.0500 time 0.7145 (0.5844) data time 0.0006 (0.0018) model time 0.7138 (0.5833) loss 8.6845 (7.3780) grad_norm 2.1843 (2.6146) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:11:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][420/625] eta 0:02:00 lr 0.000437 wd 0.0500 time 0.5694 (0.5863) data time 0.0008 (0.0018) model time 0.5686 (0.5855) loss 8.1013 (7.3733) grad_norm 1.7614 (2.6178) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:11:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][430/625] eta 0:01:54 lr 0.000437 wd 0.0500 time 0.5658 (0.5874) data time 0.0008 (0.0018) model time 0.5650 (0.5868) loss 8.2208 (7.3843) grad_norm 2.5386 (2.6167) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:11:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][440/625] eta 0:01:48 lr 0.000437 wd 0.0500 time 0.6568 (0.5879) data time 0.0006 (0.0017) model time 0.6562 (0.5873) loss 8.7689 (7.3900) grad_norm 2.2516 (2.6201) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:11:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][450/625] eta 0:01:42 lr 0.000437 wd 0.0500 time 0.5733 (0.5877) data time 0.0006 (0.0017) model time 0.5727 (0.5871) loss 8.1192 (7.3871) grad_norm 2.1938 (2.6196) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:11:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][460/625] eta 0:01:36 lr 0.000437 wd 0.0500 time 0.5713 (0.5874) data time 0.0006 (0.0017) model time 0.5707 (0.5867) loss 7.2161 (7.3841) grad_norm 2.3235 (2.6124) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:11:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][470/625] eta 0:01:31 lr 0.000437 wd 0.0500 time 0.5762 (0.5871) data time 0.0006 (0.0017) model time 0.5756 (0.5865) loss 6.3771 (7.3733) grad_norm 1.9911 (2.6053) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:11:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][480/625] eta 0:01:25 lr 0.000437 wd 0.0500 time 0.5698 (0.5869) data time 0.0006 (0.0017) model time 0.5692 (0.5861) loss 7.5654 (7.3756) grad_norm 2.0331 (2.5995) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:11:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][490/625] eta 0:01:19 lr 0.000437 wd 0.0500 time 0.5669 (0.5866) data time 0.0006 (0.0016) model time 0.5664 (0.5858) loss 8.3204 (7.3896) grad_norm 2.4957 (2.5927) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:11:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][500/625] eta 0:01:13 lr 0.000436 wd 0.0500 time 0.5715 (0.5863) data time 0.0008 (0.0016) model time 0.5707 (0.5855) loss 6.1153 (7.3867) grad_norm 1.8079 (2.5898) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:11:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][510/625] eta 0:01:07 lr 0.000436 wd 0.0500 time 0.5624 (0.5861) data time 0.0006 (0.0016) model time 0.5618 (0.5853) loss 6.9810 (7.3850) grad_norm 2.3267 (2.5925) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:12:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][520/625] eta 0:01:01 lr 0.000436 wd 0.0500 time 0.5799 (0.5859) data time 0.0008 (0.0016) model time 0.5791 (0.5850) loss 6.0602 (7.3856) grad_norm 5.3313 (2.6082) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:12:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][530/625] eta 0:00:55 lr 0.000436 wd 0.0500 time 0.7926 (0.5860) data time 0.0008 (0.0016) model time 0.7917 (0.5852) loss 8.8404 (7.3933) grad_norm 2.1144 (2.6082) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:12:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][540/625] eta 0:00:49 lr 0.000436 wd 0.0500 time 0.5733 (0.5857) data time 0.0009 (0.0016) model time 0.5724 (0.5849) loss 7.0513 (7.3871) grad_norm 3.2286 (2.6175) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:12:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][550/625] eta 0:00:43 lr 0.000436 wd 0.0500 time 0.5704 (0.5855) data time 0.0008 (0.0016) model time 0.5696 (0.5846) loss 6.3867 (7.3850) grad_norm 2.6495 (2.6182) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:12:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][560/625] eta 0:00:38 lr 0.000436 wd 0.0500 time 0.5733 (0.5853) data time 0.0008 (0.0015) model time 0.5725 (0.5844) loss 9.3039 (7.3888) grad_norm 2.5666 (2.6140) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:12:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][570/625] eta 0:00:32 lr 0.000436 wd 0.0500 time 0.5700 (0.5851) data time 0.0006 (0.0015) model time 0.5694 (0.5842) loss 5.9796 (7.3883) grad_norm 2.7982 (2.6129) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:12:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][580/625] eta 0:00:26 lr 0.000436 wd 0.0500 time 0.5716 (0.5849) data time 0.0007 (0.0015) model time 0.5709 (0.5840) loss 7.6570 (7.3886) grad_norm 3.5469 (2.6169) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:12:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][590/625] eta 0:00:20 lr 0.000436 wd 0.0500 time 0.5719 (0.5848) data time 0.0008 (0.0015) model time 0.5711 (0.5839) loss 7.7880 (7.3900) grad_norm 2.5474 (2.6186) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:12:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][600/625] eta 0:00:14 lr 0.000435 wd 0.0500 time 0.5736 (0.5846) data time 0.0008 (0.0015) model time 0.5729 (0.5837) loss 8.1759 (7.3937) grad_norm 1.9315 (2.6277) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:12:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][610/625] eta 0:00:08 lr 0.000435 wd 0.0500 time 0.6746 (0.5847) data time 0.0006 (0.0015) model time 0.6740 (0.5837) loss 7.1654 (7.3935) grad_norm 1.7456 (2.6359) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:12:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [185/300][620/625] eta 0:00:02 lr 0.000435 wd 0.0500 time 0.5719 (0.5845) data time 0.0004 (0.0015) model time 0.5715 (0.5835) loss 7.9852 (7.3962) grad_norm 2.3264 (2.6347) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:13:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 185 training takes 0:06:05 +[2024-07-25 09:13:01 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 09:13:03 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 09:13:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.480 (0.480) Loss 0.5049 (0.5049) Acc@1 90.137 (90.137) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 09:13:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.7891 (0.6355) Acc@1 82.764 (86.883) Acc@5 96.533 (97.918) Mem 22339MB +[2024-07-25 09:13:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.143) Loss 0.8916 (0.7340) Acc@1 78.516 (83.947) Acc@5 95.410 (96.952) Mem 22339MB +[2024-07-25 09:13:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.631 Acc@5 96.955 +[2024-07-25 09:13:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.6% +[2024-07-25 09:13:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 83.63% +[2024-07-25 09:13:06 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 09:13:08 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 09:13:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.490 (0.490) Loss 0.4993 (0.4993) Acc@1 90.039 (90.039) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-25 09:13:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.159) Loss 0.7524 (0.6213) Acc@1 83.105 (87.127) Acc@5 96.533 (97.940) Mem 22339MB +[2024-07-25 09:13:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.8779 (0.7169) Acc@1 78.662 (84.154) Acc@5 95.801 (96.996) Mem 22339MB +[2024-07-25 09:13:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.755 Acc@5 96.981 +[2024-07-25 09:13:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.8% +[2024-07-25 09:13:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][0/625] eta 0:15:34 lr 0.000435 wd 0.0500 time 1.4947 (1.4947) data time 0.6082 (0.6082) model time 0.0000 (0.0000) loss 7.8876 (7.8876) grad_norm 2.0481 (2.0481) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:13:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][10/625] eta 0:07:22 lr 0.000435 wd 0.0500 time 0.7725 (0.7195) data time 0.0008 (0.0561) model time 0.0000 (0.0000) loss 6.5098 (6.9832) grad_norm 1.5799 (1.9231) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:13:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][20/625] eta 0:06:42 lr 0.000435 wd 0.0500 time 0.5734 (0.6650) data time 0.0008 (0.0298) model time 0.0000 (0.0000) loss 7.7038 (7.1827) grad_norm 1.6759 (1.9639) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:13:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][30/625] eta 0:06:27 lr 0.000435 wd 0.0500 time 0.5848 (0.6515) data time 0.0008 (0.0205) model time 0.0000 (0.0000) loss 7.9020 (7.3449) grad_norm 1.7538 (2.0027) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:13:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][40/625] eta 0:06:11 lr 0.000435 wd 0.0500 time 0.5758 (0.6359) data time 0.0006 (0.0157) model time 0.0000 (0.0000) loss 7.7600 (7.4157) grad_norm 3.6551 (2.3466) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:13:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][50/625] eta 0:06:00 lr 0.000435 wd 0.0500 time 0.5751 (0.6266) data time 0.0008 (0.0128) model time 0.0000 (0.0000) loss 8.4517 (7.4838) grad_norm 3.1619 (2.5559) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:13:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][60/625] eta 0:05:49 lr 0.000435 wd 0.0500 time 0.5776 (0.6181) data time 0.0006 (0.0108) model time 0.5770 (0.5741) loss 6.1307 (7.5362) grad_norm 2.0928 (2.5129) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:13:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][70/625] eta 0:05:40 lr 0.000434 wd 0.0500 time 0.5760 (0.6142) data time 0.0006 (0.0094) model time 0.5754 (0.5818) loss 5.6375 (7.4612) grad_norm 2.2853 (2.4598) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:14:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][80/625] eta 0:05:32 lr 0.000434 wd 0.0500 time 0.5730 (0.6093) data time 0.0008 (0.0084) model time 0.5723 (0.5788) loss 7.9524 (7.4609) grad_norm 1.6641 (2.4408) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:14:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][90/625] eta 0:05:23 lr 0.000434 wd 0.0500 time 0.5769 (0.6055) data time 0.0008 (0.0076) model time 0.5762 (0.5775) loss 7.1097 (7.4446) grad_norm 2.5868 (2.4428) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:14:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][100/625] eta 0:05:16 lr 0.000434 wd 0.0500 time 0.5789 (0.6025) data time 0.0008 (0.0069) model time 0.5782 (0.5769) loss 5.9773 (7.4215) grad_norm 1.8307 (2.4051) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:14:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][110/625] eta 0:05:08 lr 0.000434 wd 0.0500 time 0.5764 (0.5998) data time 0.0009 (0.0064) model time 0.5756 (0.5761) loss 8.0421 (7.4483) grad_norm 2.5309 (2.3820) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:14:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][120/625] eta 0:05:01 lr 0.000434 wd 0.0500 time 0.5739 (0.5976) data time 0.0008 (0.0059) model time 0.5731 (0.5755) loss 8.3228 (7.4555) grad_norm 1.6838 (2.3717) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:14:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][130/625] eta 0:04:54 lr 0.000434 wd 0.0500 time 0.5769 (0.5959) data time 0.0008 (0.0055) model time 0.5761 (0.5753) loss 8.0726 (7.4519) grad_norm 2.0488 (2.3643) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:14:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][140/625] eta 0:04:48 lr 0.000434 wd 0.0500 time 0.5736 (0.5946) data time 0.0009 (0.0052) model time 0.5727 (0.5756) loss 7.4743 (7.4547) grad_norm 3.4895 (2.3999) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:14:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][150/625] eta 0:04:41 lr 0.000434 wd 0.0500 time 0.5732 (0.5933) data time 0.0008 (0.0049) model time 0.5724 (0.5753) loss 7.6161 (7.4546) grad_norm 3.1570 (2.4529) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:14:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][160/625] eta 0:04:35 lr 0.000434 wd 0.0500 time 0.5748 (0.5921) data time 0.0008 (0.0047) model time 0.5740 (0.5752) loss 6.7860 (7.4344) grad_norm 1.9799 (2.4856) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:14:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][170/625] eta 0:04:28 lr 0.000433 wd 0.0500 time 0.5747 (0.5910) data time 0.0006 (0.0044) model time 0.5741 (0.5750) loss 7.4725 (7.4219) grad_norm 2.3920 (2.4710) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:14:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][180/625] eta 0:04:22 lr 0.000433 wd 0.0500 time 0.5722 (0.5901) data time 0.0006 (0.0042) model time 0.5716 (0.5748) loss 6.9156 (7.4158) grad_norm 2.8513 (2.4652) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:15:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][190/625] eta 0:04:16 lr 0.000433 wd 0.0500 time 0.5754 (0.5892) data time 0.0006 (0.0041) model time 0.5749 (0.5746) loss 7.2686 (7.4155) grad_norm 2.1892 (2.4420) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:15:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][200/625] eta 0:04:10 lr 0.000433 wd 0.0500 time 0.5761 (0.5884) data time 0.0006 (0.0039) model time 0.5754 (0.5745) loss 7.6458 (7.3926) grad_norm 1.7572 (2.4315) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:15:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][210/625] eta 0:04:04 lr 0.000433 wd 0.0500 time 0.5751 (0.5887) data time 0.0006 (0.0038) model time 0.5745 (0.5756) loss 6.8652 (7.4065) grad_norm 2.1508 (2.4234) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:15:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][220/625] eta 0:03:58 lr 0.000433 wd 0.0500 time 0.6106 (0.5887) data time 0.0008 (0.0036) model time 0.6098 (0.5764) loss 8.8923 (7.4250) grad_norm 2.6261 (2.4152) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:15:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][230/625] eta 0:03:53 lr 0.000433 wd 0.0500 time 0.5744 (0.5905) data time 0.0006 (0.0035) model time 0.5737 (0.5794) loss 7.7881 (7.4051) grad_norm 1.7380 (2.4015) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:15:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][240/625] eta 0:03:48 lr 0.000433 wd 0.0500 time 0.5818 (0.5923) data time 0.0008 (0.0034) model time 0.5811 (0.5821) loss 8.0696 (7.4283) grad_norm 2.3710 (2.3922) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:15:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][250/625] eta 0:03:42 lr 0.000433 wd 0.0500 time 0.6764 (0.5928) data time 0.0008 (0.0033) model time 0.6756 (0.5832) loss 5.5982 (7.4104) grad_norm 2.9283 (2.3934) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:15:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][260/625] eta 0:03:36 lr 0.000433 wd 0.0500 time 0.5732 (0.5924) data time 0.0008 (0.0032) model time 0.5724 (0.5832) loss 8.4798 (7.4077) grad_norm 1.9689 (2.4047) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:15:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][270/625] eta 0:03:30 lr 0.000432 wd 0.0500 time 0.5760 (0.5919) data time 0.0008 (0.0031) model time 0.5752 (0.5829) loss 8.2995 (7.4178) grad_norm 2.8581 (2.4068) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:15:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][280/625] eta 0:03:23 lr 0.000432 wd 0.0500 time 0.5768 (0.5913) data time 0.0006 (0.0031) model time 0.5761 (0.5825) loss 6.6191 (7.4174) grad_norm 2.8141 (2.4025) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:16:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][290/625] eta 0:03:18 lr 0.000432 wd 0.0500 time 0.5786 (0.5911) data time 0.0006 (0.0030) model time 0.5780 (0.5826) loss 8.2070 (7.4213) grad_norm 1.8059 (2.4019) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:16:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][300/625] eta 0:03:11 lr 0.000432 wd 0.0500 time 0.5761 (0.5905) data time 0.0008 (0.0029) model time 0.5753 (0.5823) loss 6.6898 (7.4247) grad_norm 2.1723 (2.3986) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:16:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][310/625] eta 0:03:05 lr 0.000432 wd 0.0500 time 0.5741 (0.5900) data time 0.0006 (0.0028) model time 0.5735 (0.5819) loss 9.0482 (7.4149) grad_norm 2.5442 (2.3969) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:16:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][320/625] eta 0:02:59 lr 0.000432 wd 0.0500 time 0.5761 (0.5895) data time 0.0008 (0.0028) model time 0.5753 (0.5816) loss 6.4173 (7.4069) grad_norm 1.8909 (2.4286) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:16:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][330/625] eta 0:02:53 lr 0.000432 wd 0.0500 time 0.5812 (0.5891) data time 0.0008 (0.0027) model time 0.5804 (0.5813) loss 7.3268 (7.4014) grad_norm 4.0243 (2.4327) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:16:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][340/625] eta 0:02:47 lr 0.000432 wd 0.0500 time 0.5893 (0.5887) data time 0.0008 (0.0027) model time 0.5885 (0.5811) loss 8.8865 (7.4070) grad_norm 2.2333 (2.4650) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:16:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][350/625] eta 0:02:41 lr 0.000432 wd 0.0500 time 0.5787 (0.5883) data time 0.0008 (0.0026) model time 0.5780 (0.5809) loss 7.7221 (7.4134) grad_norm 5.6454 (2.4843) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:16:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][360/625] eta 0:02:35 lr 0.000431 wd 0.0500 time 0.5744 (0.5879) data time 0.0008 (0.0026) model time 0.5736 (0.5807) loss 7.2277 (7.4115) grad_norm 2.2442 (2.4910) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:16:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][370/625] eta 0:02:29 lr 0.000431 wd 0.0500 time 0.5776 (0.5876) data time 0.0008 (0.0025) model time 0.5768 (0.5804) loss 8.1770 (7.4192) grad_norm 2.1171 (2.4889) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:16:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][380/625] eta 0:02:23 lr 0.000431 wd 0.0500 time 0.5884 (0.5872) data time 0.0006 (0.0025) model time 0.5878 (0.5803) loss 6.2878 (7.4036) grad_norm 2.1805 (2.5265) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:17:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][390/625] eta 0:02:17 lr 0.000431 wd 0.0500 time 0.5780 (0.5869) data time 0.0008 (0.0024) model time 0.5773 (0.5801) loss 7.9115 (7.4100) grad_norm 3.0408 (2.5253) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:17:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][400/625] eta 0:02:11 lr 0.000431 wd 0.0500 time 0.5738 (0.5866) data time 0.0008 (0.0024) model time 0.5730 (0.5799) loss 7.5446 (7.4126) grad_norm 2.1989 (2.5299) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:17:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][410/625] eta 0:02:06 lr 0.000431 wd 0.0500 time 0.5872 (0.5863) data time 0.0006 (0.0024) model time 0.5865 (0.5797) loss 7.4713 (7.4129) grad_norm 2.2116 (2.5282) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:17:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][420/625] eta 0:02:00 lr 0.000431 wd 0.0500 time 0.5743 (0.5860) data time 0.0006 (0.0023) model time 0.5737 (0.5795) loss 8.4739 (7.4135) grad_norm 3.0901 (2.5233) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:17:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][430/625] eta 0:01:54 lr 0.000431 wd 0.0500 time 0.5769 (0.5860) data time 0.0006 (0.0023) model time 0.5763 (0.5797) loss 7.7172 (7.4130) grad_norm 2.0881 (2.5725) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:17:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][440/625] eta 0:01:48 lr 0.000431 wd 0.0500 time 0.5744 (0.5859) data time 0.0008 (0.0023) model time 0.5737 (0.5796) loss 8.1326 (7.4187) grad_norm 2.5611 (2.6078) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:17:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][450/625] eta 0:01:42 lr 0.000431 wd 0.0500 time 0.7502 (0.5866) data time 0.0008 (0.0023) model time 0.7493 (0.5806) loss 8.4438 (7.4232) grad_norm 2.9096 (2.6038) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:17:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][460/625] eta 0:01:36 lr 0.000430 wd 0.0500 time 0.7451 (0.5874) data time 0.0009 (0.0022) model time 0.7442 (0.5816) loss 7.9443 (7.4243) grad_norm 2.1061 (2.5976) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:17:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][470/625] eta 0:01:31 lr 0.000430 wd 0.0500 time 0.5769 (0.5881) data time 0.0008 (0.0022) model time 0.5760 (0.5825) loss 8.7395 (7.4309) grad_norm 2.7617 (2.5927) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:17:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][480/625] eta 0:01:25 lr 0.000430 wd 0.0500 time 0.5746 (0.5884) data time 0.0006 (0.0022) model time 0.5739 (0.5830) loss 7.3235 (7.4247) grad_norm 19.2785 (2.6190) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:18:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][490/625] eta 0:01:19 lr 0.000430 wd 0.0500 time 0.5719 (0.5882) data time 0.0006 (0.0021) model time 0.5713 (0.5828) loss 6.3111 (7.4130) grad_norm 2.5834 (2.6192) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:18:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][500/625] eta 0:01:13 lr 0.000430 wd 0.0500 time 0.5739 (0.5879) data time 0.0008 (0.0021) model time 0.5731 (0.5826) loss 5.4919 (7.4012) grad_norm 2.4335 (2.6182) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:18:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][510/625] eta 0:01:07 lr 0.000430 wd 0.0500 time 0.5744 (0.5880) data time 0.0006 (0.0021) model time 0.5738 (0.5828) loss 8.1242 (7.4024) grad_norm 2.3092 (2.6164) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:18:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][520/625] eta 0:01:01 lr 0.000430 wd 0.0500 time 0.5731 (0.5877) data time 0.0008 (0.0021) model time 0.5723 (0.5825) loss 6.0146 (7.4046) grad_norm 3.1259 (2.6146) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:18:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][530/625] eta 0:00:55 lr 0.000430 wd 0.0500 time 0.5737 (0.5874) data time 0.0008 (0.0021) model time 0.5729 (0.5823) loss 7.7391 (7.4101) grad_norm 2.2932 (2.6104) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:18:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][540/625] eta 0:00:49 lr 0.000430 wd 0.0500 time 0.5805 (0.5872) data time 0.0009 (0.0020) model time 0.5797 (0.5822) loss 8.4397 (7.4138) grad_norm 3.3433 (2.6016) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:18:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][550/625] eta 0:00:44 lr 0.000430 wd 0.0500 time 0.5716 (0.5869) data time 0.0009 (0.0020) model time 0.5707 (0.5820) loss 7.8864 (7.4243) grad_norm 2.0441 (2.6018) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:18:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][560/625] eta 0:00:38 lr 0.000429 wd 0.0500 time 0.5728 (0.5867) data time 0.0008 (0.0020) model time 0.5720 (0.5818) loss 8.7245 (7.4251) grad_norm 2.1630 (2.5968) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:18:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][570/625] eta 0:00:32 lr 0.000429 wd 0.0500 time 0.5748 (0.5865) data time 0.0008 (0.0020) model time 0.5740 (0.5817) loss 6.9419 (7.4205) grad_norm 1.7933 (2.5953) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:18:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][580/625] eta 0:00:26 lr 0.000429 wd 0.0500 time 0.5844 (0.5863) data time 0.0006 (0.0019) model time 0.5838 (0.5816) loss 7.9263 (7.4193) grad_norm 2.3367 (2.5983) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:18:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][590/625] eta 0:00:20 lr 0.000429 wd 0.0500 time 0.5804 (0.5861) data time 0.0006 (0.0019) model time 0.5798 (0.5814) loss 8.1564 (7.4261) grad_norm 2.1982 (2.6042) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:19:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][600/625] eta 0:00:14 lr 0.000429 wd 0.0500 time 0.5747 (0.5859) data time 0.0006 (0.0019) model time 0.5741 (0.5813) loss 8.8213 (7.4245) grad_norm 1.8618 (2.5959) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:19:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][610/625] eta 0:00:08 lr 0.000429 wd 0.0500 time 0.5736 (0.5858) data time 0.0006 (0.0019) model time 0.5730 (0.5811) loss 7.8604 (7.4173) grad_norm 2.5609 (2.5907) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:19:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [186/300][620/625] eta 0:00:02 lr 0.000429 wd 0.0500 time 0.5789 (0.5856) data time 0.0004 (0.0019) model time 0.5784 (0.5810) loss 6.4126 (7.4171) grad_norm 3.3149 (2.6021) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:19:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 186 training takes 0:06:05 +[2024-07-25 09:19:17 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 09:19:18 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 09:19:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.486 (0.486) Loss 0.5200 (0.5200) Acc@1 89.404 (89.404) Acc@5 98.730 (98.730) Mem 22339MB +[2024-07-25 09:19:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.7812 (0.6387) Acc@1 82.422 (86.825) Acc@5 96.094 (97.825) Mem 22339MB +[2024-07-25 09:19:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.8970 (0.7404) Acc@1 77.686 (83.836) Acc@5 95.703 (96.826) Mem 22339MB +[2024-07-25 09:19:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.429 Acc@5 96.817 +[2024-07-25 09:19:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.4% +[2024-07-25 09:19:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.762 (0.762) Loss 0.4993 (0.4993) Acc@1 90.088 (90.088) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-25 09:19:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.183) Loss 0.7520 (0.6210) Acc@1 83.105 (87.185) Acc@5 96.582 (97.945) Mem 22339MB +[2024-07-25 09:19:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.156) Loss 0.8774 (0.7165) Acc@1 78.613 (84.222) Acc@5 95.850 (97.012) Mem 22339MB +[2024-07-25 09:19:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.817 Acc@5 96.997 +[2024-07-25 09:19:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.8% +[2024-07-25 09:19:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.82% +[2024-07-25 09:19:26 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 09:19:27 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 09:19:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][0/625] eta 0:09:14 lr 0.000429 wd 0.0500 time 0.8875 (0.8875) data time 0.3690 (0.3690) model time 0.0000 (0.0000) loss 8.3533 (8.3533) grad_norm 3.5911 (3.5911) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:19:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][10/625] eta 0:06:10 lr 0.000429 wd 0.0500 time 0.5752 (0.6024) data time 0.0008 (0.0348) model time 0.0000 (0.0000) loss 9.1879 (7.5617) grad_norm 3.1184 (3.1636) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:19:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][20/625] eta 0:06:01 lr 0.000429 wd 0.0500 time 0.6823 (0.5977) data time 0.0006 (0.0187) model time 0.0000 (0.0000) loss 8.3372 (7.5706) grad_norm 2.5818 (3.5649) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:19:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][30/625] eta 0:05:51 lr 0.000428 wd 0.0500 time 0.5668 (0.5902) data time 0.0008 (0.0129) model time 0.0000 (0.0000) loss 7.4971 (7.4705) grad_norm 14.9113 (3.6328) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:19:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][40/625] eta 0:05:47 lr 0.000428 wd 0.0500 time 0.6189 (0.5939) data time 0.0006 (0.0100) model time 0.0000 (0.0000) loss 5.5307 (7.4282) grad_norm 1.9812 (3.4518) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:19:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][50/625] eta 0:05:45 lr 0.000428 wd 0.0500 time 0.7504 (0.6010) data time 0.0008 (0.0082) model time 0.0000 (0.0000) loss 5.9538 (7.4127) grad_norm 2.4722 (3.2698) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:20:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][60/625] eta 0:05:40 lr 0.000428 wd 0.0500 time 0.7403 (0.6029) data time 0.0008 (0.0070) model time 0.7395 (0.6122) loss 7.9947 (7.4172) grad_norm 2.6171 (3.0923) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:20:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][70/625] eta 0:05:34 lr 0.000428 wd 0.0500 time 0.5618 (0.6036) data time 0.0006 (0.0061) model time 0.5611 (0.6094) loss 7.7600 (7.4281) grad_norm 2.4889 (2.9729) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:20:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][80/625] eta 0:05:27 lr 0.000428 wd 0.0500 time 0.5674 (0.6004) data time 0.0008 (0.0055) model time 0.5666 (0.5986) loss 8.2132 (7.4387) grad_norm 2.0730 (2.8999) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:20:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][90/625] eta 0:05:20 lr 0.000428 wd 0.0500 time 0.5697 (0.5989) data time 0.0008 (0.0050) model time 0.5690 (0.5955) loss 8.8810 (7.4806) grad_norm 1.5652 (2.8146) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:20:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][100/625] eta 0:05:13 lr 0.000428 wd 0.0500 time 0.5742 (0.5967) data time 0.0006 (0.0046) model time 0.5736 (0.5916) loss 8.5276 (7.4584) grad_norm 1.5164 (2.7524) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:20:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][110/625] eta 0:05:06 lr 0.000428 wd 0.0500 time 0.5730 (0.5949) data time 0.0006 (0.0042) model time 0.5724 (0.5888) loss 6.7837 (7.4583) grad_norm 2.6280 (2.7027) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:20:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][120/625] eta 0:04:59 lr 0.000428 wd 0.0500 time 0.5725 (0.5932) data time 0.0008 (0.0040) model time 0.5717 (0.5865) loss 6.4946 (7.4311) grad_norm 1.6589 (2.7052) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:20:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][130/625] eta 0:04:52 lr 0.000427 wd 0.0500 time 0.5712 (0.5917) data time 0.0008 (0.0038) model time 0.5704 (0.5849) loss 6.1374 (7.4406) grad_norm 2.5818 (2.6510) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:20:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][140/625] eta 0:04:46 lr 0.000427 wd 0.0500 time 0.5693 (0.5905) data time 0.0006 (0.0035) model time 0.5686 (0.5836) loss 8.9523 (7.4362) grad_norm 1.7788 (2.6273) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:20:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][150/625] eta 0:04:40 lr 0.000427 wd 0.0500 time 0.5742 (0.5895) data time 0.0006 (0.0034) model time 0.5736 (0.5827) loss 6.9473 (7.4048) grad_norm 3.5902 (2.6506) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:21:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][160/625] eta 0:04:33 lr 0.000427 wd 0.0500 time 0.5727 (0.5886) data time 0.0009 (0.0032) model time 0.5717 (0.5819) loss 8.3523 (7.3971) grad_norm 4.0722 (2.6656) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:21:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][170/625] eta 0:04:27 lr 0.000427 wd 0.0500 time 0.5762 (0.5878) data time 0.0008 (0.0031) model time 0.5755 (0.5812) loss 7.9493 (7.3995) grad_norm 1.9465 (2.6327) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:21:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][180/625] eta 0:04:21 lr 0.000427 wd 0.0500 time 0.5736 (0.5871) data time 0.0006 (0.0030) model time 0.5730 (0.5807) loss 8.3181 (7.4289) grad_norm 1.5921 (2.6176) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:21:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][190/625] eta 0:04:15 lr 0.000427 wd 0.0500 time 0.5736 (0.5864) data time 0.0006 (0.0028) model time 0.5730 (0.5802) loss 7.0895 (7.4267) grad_norm 3.5658 (2.6174) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:21:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][200/625] eta 0:04:08 lr 0.000427 wd 0.0500 time 0.5745 (0.5858) data time 0.0006 (0.0027) model time 0.5738 (0.5798) loss 7.0480 (7.4199) grad_norm 1.7715 (2.6312) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:21:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][210/625] eta 0:04:02 lr 0.000427 wd 0.0500 time 0.5729 (0.5853) data time 0.0008 (0.0027) model time 0.5721 (0.5794) loss 8.1718 (7.4366) grad_norm 2.3917 (2.6209) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:21:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][220/625] eta 0:03:56 lr 0.000427 wd 0.0500 time 0.5749 (0.5849) data time 0.0006 (0.0026) model time 0.5743 (0.5791) loss 8.6554 (7.4481) grad_norm 2.4069 (2.6054) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:21:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][230/625] eta 0:03:50 lr 0.000426 wd 0.0500 time 0.5738 (0.5844) data time 0.0008 (0.0025) model time 0.5731 (0.5788) loss 7.7193 (7.4537) grad_norm 2.8139 (2.6115) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:21:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][240/625] eta 0:03:45 lr 0.000426 wd 0.0500 time 0.5677 (0.5845) data time 0.0006 (0.0024) model time 0.5671 (0.5792) loss 7.0521 (7.4241) grad_norm 1.7547 (2.5982) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:21:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][250/625] eta 0:03:39 lr 0.000426 wd 0.0500 time 0.5698 (0.5844) data time 0.0008 (0.0024) model time 0.5690 (0.5793) loss 7.1660 (7.4130) grad_norm 1.8467 (2.5835) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:22:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][260/625] eta 0:03:33 lr 0.000426 wd 0.0500 time 0.7337 (0.5857) data time 0.0006 (0.0023) model time 0.7331 (0.5811) loss 6.8781 (7.4137) grad_norm 2.3356 (2.5615) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:22:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][270/625] eta 0:03:28 lr 0.000426 wd 0.0500 time 0.7392 (0.5885) data time 0.0010 (0.0023) model time 0.7382 (0.5847) loss 7.6808 (7.4066) grad_norm 2.7964 (2.5473) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:22:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][280/625] eta 0:03:23 lr 0.000426 wd 0.0500 time 0.7035 (0.5890) data time 0.0007 (0.0022) model time 0.7027 (0.5855) loss 6.9295 (7.3965) grad_norm 2.6171 (2.5831) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:22:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][290/625] eta 0:03:17 lr 0.000426 wd 0.0500 time 0.5738 (0.5896) data time 0.0006 (0.0022) model time 0.5732 (0.5863) loss 6.9736 (7.4025) grad_norm 4.3343 (2.5903) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:22:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][300/625] eta 0:03:11 lr 0.000426 wd 0.0500 time 0.5746 (0.5898) data time 0.0006 (0.0021) model time 0.5740 (0.5866) loss 7.8058 (7.4068) grad_norm 2.0982 (2.5892) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:22:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][310/625] eta 0:03:05 lr 0.000426 wd 0.0500 time 0.5734 (0.5894) data time 0.0006 (0.0021) model time 0.5728 (0.5862) loss 6.3975 (7.3829) grad_norm 2.7646 (2.5827) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:22:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][320/625] eta 0:02:59 lr 0.000426 wd 0.0500 time 0.5746 (0.5890) data time 0.0008 (0.0020) model time 0.5738 (0.5859) loss 8.4245 (7.3830) grad_norm 2.1519 (2.6168) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:22:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][330/625] eta 0:02:53 lr 0.000425 wd 0.0500 time 0.5731 (0.5885) data time 0.0006 (0.0020) model time 0.5725 (0.5854) loss 6.9729 (7.3937) grad_norm 2.3336 (2.6270) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:22:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][340/625] eta 0:02:47 lr 0.000425 wd 0.0500 time 0.5596 (0.5882) data time 0.0010 (0.0020) model time 0.5586 (0.5851) loss 8.5921 (7.3859) grad_norm 1.8611 (2.6260) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:22:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][350/625] eta 0:02:41 lr 0.000425 wd 0.0500 time 0.5704 (0.5878) data time 0.0006 (0.0020) model time 0.5698 (0.5847) loss 6.2345 (7.3861) grad_norm 3.0321 (2.6197) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:22:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][360/625] eta 0:02:35 lr 0.000425 wd 0.0500 time 0.5723 (0.5876) data time 0.0006 (0.0019) model time 0.5716 (0.5845) loss 7.9650 (7.3885) grad_norm 3.7810 (2.6188) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:23:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][370/625] eta 0:02:29 lr 0.000425 wd 0.0500 time 0.5710 (0.5873) data time 0.0008 (0.0019) model time 0.5703 (0.5842) loss 8.1142 (7.3783) grad_norm 1.7154 (2.6081) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:23:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][380/625] eta 0:02:23 lr 0.000425 wd 0.0500 time 0.5715 (0.5869) data time 0.0008 (0.0019) model time 0.5707 (0.5838) loss 7.0253 (7.3716) grad_norm 2.7078 (2.5982) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:23:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][390/625] eta 0:02:17 lr 0.000425 wd 0.0500 time 0.5721 (0.5866) data time 0.0008 (0.0018) model time 0.5714 (0.5835) loss 7.9958 (7.3643) grad_norm 3.6234 (2.6006) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:23:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][400/625] eta 0:02:11 lr 0.000425 wd 0.0500 time 0.5728 (0.5863) data time 0.0006 (0.0018) model time 0.5722 (0.5833) loss 8.1529 (7.3626) grad_norm 2.3464 (2.5932) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:23:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][410/625] eta 0:02:06 lr 0.000425 wd 0.0500 time 0.5687 (0.5861) data time 0.0006 (0.0018) model time 0.5681 (0.5831) loss 8.2952 (7.3741) grad_norm 2.1197 (2.5910) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:23:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][420/625] eta 0:02:00 lr 0.000425 wd 0.0500 time 0.5744 (0.5858) data time 0.0006 (0.0018) model time 0.5738 (0.5828) loss 7.4280 (7.3754) grad_norm 2.1452 (2.5824) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:23:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][430/625] eta 0:01:54 lr 0.000424 wd 0.0500 time 0.5771 (0.5856) data time 0.0008 (0.0018) model time 0.5763 (0.5826) loss 7.7200 (7.3775) grad_norm 3.1473 (2.6158) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:23:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][440/625] eta 0:01:48 lr 0.000424 wd 0.0500 time 0.5724 (0.5853) data time 0.0008 (0.0017) model time 0.5716 (0.5824) loss 7.7624 (7.3743) grad_norm 2.4337 (2.6206) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:23:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][450/625] eta 0:01:42 lr 0.000424 wd 0.0500 time 0.5728 (0.5851) data time 0.0006 (0.0017) model time 0.5722 (0.5822) loss 7.7321 (7.3699) grad_norm 2.0581 (2.6141) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:23:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][460/625] eta 0:01:36 lr 0.000424 wd 0.0500 time 0.5715 (0.5850) data time 0.0007 (0.0017) model time 0.5709 (0.5821) loss 5.9386 (7.3646) grad_norm 2.2158 (2.6035) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:24:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][470/625] eta 0:01:30 lr 0.000424 wd 0.0500 time 0.5733 (0.5850) data time 0.0006 (0.0017) model time 0.5727 (0.5822) loss 7.5894 (7.3665) grad_norm 1.8983 (2.5972) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:24:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][480/625] eta 0:01:24 lr 0.000424 wd 0.0500 time 0.6770 (0.5858) data time 0.0008 (0.0017) model time 0.6762 (0.5831) loss 8.1460 (7.3696) grad_norm 2.1795 (2.5864) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:24:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][490/625] eta 0:01:19 lr 0.000424 wd 0.0500 time 0.7476 (0.5869) data time 0.0009 (0.0016) model time 0.7467 (0.5844) loss 6.7034 (7.3681) grad_norm 2.1482 (2.5773) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:24:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][500/625] eta 0:01:13 lr 0.000424 wd 0.0500 time 0.5683 (0.5874) data time 0.0008 (0.0016) model time 0.5675 (0.5850) loss 7.4708 (7.3717) grad_norm 1.6990 (2.5702) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 09:24:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][510/625] eta 0:01:07 lr 0.000424 wd 0.0500 time 0.7126 (0.5879) data time 0.0008 (0.0016) model time 0.7118 (0.5856) loss 6.2757 (7.3673) grad_norm 3.1042 (2.5658) loss_scale 512.0000 (261.0098) mem 22339MB +[2024-07-25 09:24:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][520/625] eta 0:01:01 lr 0.000424 wd 0.0500 time 0.5733 (0.5877) data time 0.0009 (0.0016) model time 0.5724 (0.5853) loss 7.8270 (7.3670) grad_norm 2.7011 (2.5640) loss_scale 512.0000 (265.8273) mem 22339MB +[2024-07-25 09:24:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][530/625] eta 0:00:55 lr 0.000423 wd 0.0500 time 0.5757 (0.5877) data time 0.0009 (0.0016) model time 0.5748 (0.5854) loss 7.2224 (7.3675) grad_norm 1.9020 (2.5609) loss_scale 512.0000 (270.4633) mem 22339MB +[2024-07-25 09:24:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][540/625] eta 0:00:49 lr 0.000423 wd 0.0500 time 0.5688 (0.5875) data time 0.0006 (0.0016) model time 0.5681 (0.5852) loss 5.8315 (7.3607) grad_norm 2.0984 (2.5576) loss_scale 512.0000 (274.9279) mem 22339MB +[2024-07-25 09:24:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][550/625] eta 0:00:44 lr 0.000423 wd 0.0500 time 0.5742 (0.5874) data time 0.0006 (0.0016) model time 0.5736 (0.5851) loss 7.0705 (7.3558) grad_norm 1.6507 (2.5544) loss_scale 512.0000 (279.2305) mem 22339MB +[2024-07-25 09:24:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][560/625] eta 0:00:38 lr 0.000423 wd 0.0500 time 0.5710 (0.5872) data time 0.0008 (0.0016) model time 0.5702 (0.5849) loss 6.5306 (7.3566) grad_norm 1.8746 (2.5444) loss_scale 512.0000 (283.3797) mem 22339MB +[2024-07-25 09:25:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][570/625] eta 0:00:32 lr 0.000423 wd 0.0500 time 0.5733 (0.5870) data time 0.0007 (0.0015) model time 0.5727 (0.5847) loss 7.2038 (7.3444) grad_norm 2.2199 (2.5445) loss_scale 512.0000 (287.3835) mem 22339MB +[2024-07-25 09:25:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][580/625] eta 0:00:26 lr 0.000423 wd 0.0500 time 0.5672 (0.5868) data time 0.0009 (0.0015) model time 0.5663 (0.5845) loss 7.0187 (7.3420) grad_norm 1.7325 (2.5357) loss_scale 512.0000 (291.2496) mem 22339MB +[2024-07-25 09:25:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][590/625] eta 0:00:20 lr 0.000423 wd 0.0500 time 0.5729 (0.5866) data time 0.0008 (0.0015) model time 0.5721 (0.5844) loss 7.7214 (7.3416) grad_norm 2.6622 (2.5417) loss_scale 512.0000 (294.9848) mem 22339MB +[2024-07-25 09:25:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][600/625] eta 0:00:14 lr 0.000423 wd 0.0500 time 0.5705 (0.5866) data time 0.0006 (0.0015) model time 0.5699 (0.5844) loss 8.1367 (7.3444) grad_norm 2.3882 (2.5391) loss_scale 512.0000 (298.5957) mem 22339MB +[2024-07-25 09:25:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][610/625] eta 0:00:08 lr 0.000423 wd 0.0500 time 0.5704 (0.5864) data time 0.0004 (0.0015) model time 0.5700 (0.5842) loss 6.7783 (7.3479) grad_norm 3.7568 (2.5369) loss_scale 512.0000 (302.0884) mem 22339MB +[2024-07-25 09:25:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [187/300][620/625] eta 0:00:02 lr 0.000422 wd 0.0500 time 0.5726 (0.5864) data time 0.0005 (0.0015) model time 0.5721 (0.5842) loss 8.9485 (7.3541) grad_norm 2.4438 (2.5342) loss_scale 512.0000 (305.4686) mem 22339MB +[2024-07-25 09:25:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 187 training takes 0:06:06 +[2024-07-25 09:25:34 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 09:25:35 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 09:25:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.489 (0.489) Loss 0.5239 (0.5239) Acc@1 89.746 (89.746) Acc@5 98.828 (98.828) Mem 22339MB +[2024-07-25 09:25:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.159) Loss 0.7773 (0.6416) Acc@1 82.129 (86.847) Acc@5 96.582 (97.852) Mem 22339MB +[2024-07-25 09:25:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.143) Loss 0.8989 (0.7441) Acc@1 79.053 (83.850) Acc@5 95.752 (96.859) Mem 22339MB +[2024-07-25 09:25:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.459 Acc@5 96.855 +[2024-07-25 09:25:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.5% +[2024-07-25 09:25:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.864 (0.864) Loss 0.5000 (0.5000) Acc@1 90.039 (90.039) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-25 09:25:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.193) Loss 0.7510 (0.6212) Acc@1 83.105 (87.211) Acc@5 96.631 (97.940) Mem 22339MB +[2024-07-25 09:25:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.161) Loss 0.8765 (0.7164) Acc@1 78.662 (84.235) Acc@5 95.801 (97.021) Mem 22339MB +[2024-07-25 09:25:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.833 Acc@5 97.007 +[2024-07-25 09:25:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.8% +[2024-07-25 09:25:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.83% +[2024-07-25 09:25:42 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 09:25:44 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 09:25:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][0/625] eta 0:09:15 lr 0.000422 wd 0.0500 time 0.8886 (0.8886) data time 0.3697 (0.3697) model time 0.0000 (0.0000) loss 7.7945 (7.7945) grad_norm 4.8915 (4.8915) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:25:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][10/625] eta 0:06:18 lr 0.000422 wd 0.0500 time 0.5724 (0.6161) data time 0.0006 (0.0354) model time 0.0000 (0.0000) loss 7.3975 (7.4227) grad_norm 3.1268 (3.2572) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:25:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][20/625] eta 0:06:02 lr 0.000422 wd 0.0500 time 0.5665 (0.5989) data time 0.0006 (0.0190) model time 0.0000 (0.0000) loss 8.0104 (7.3995) grad_norm 2.3540 (3.0863) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:26:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][30/625] eta 0:05:52 lr 0.000422 wd 0.0500 time 0.5738 (0.5928) data time 0.0007 (0.0131) model time 0.0000 (0.0000) loss 6.0635 (7.3924) grad_norm 1.9865 (2.8569) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:26:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][40/625] eta 0:05:44 lr 0.000422 wd 0.0500 time 0.5737 (0.5885) data time 0.0007 (0.0101) model time 0.0000 (0.0000) loss 7.5626 (7.2199) grad_norm 2.0359 (2.7021) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:26:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][50/625] eta 0:05:37 lr 0.000422 wd 0.0500 time 0.5706 (0.5865) data time 0.0006 (0.0083) model time 0.0000 (0.0000) loss 8.4045 (7.2479) grad_norm 3.0645 (2.6608) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:26:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][60/625] eta 0:05:31 lr 0.000422 wd 0.0500 time 0.5627 (0.5873) data time 0.0006 (0.0071) model time 0.5621 (0.5904) loss 8.9263 (7.2711) grad_norm 1.8053 (2.5983) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:26:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][70/625] eta 0:05:26 lr 0.000422 wd 0.0500 time 0.5606 (0.5886) data time 0.0006 (0.0063) model time 0.5600 (0.5929) loss 7.9449 (7.2334) grad_norm 1.8543 (2.5312) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:26:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][80/625] eta 0:05:22 lr 0.000422 wd 0.0500 time 0.7331 (0.5914) data time 0.0006 (0.0056) model time 0.7325 (0.5988) loss 7.5744 (7.2346) grad_norm 2.4347 (2.4790) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:26:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][90/625] eta 0:05:19 lr 0.000422 wd 0.0500 time 0.5674 (0.5965) data time 0.0007 (0.0051) model time 0.5667 (0.6082) loss 6.6095 (7.2346) grad_norm 1.8827 (2.5025) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:26:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][100/625] eta 0:05:14 lr 0.000421 wd 0.0500 time 0.7566 (0.5986) data time 0.0008 (0.0047) model time 0.7558 (0.6100) loss 7.7306 (7.2759) grad_norm 1.6454 (2.4729) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:26:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][110/625] eta 0:05:08 lr 0.000421 wd 0.0500 time 0.5725 (0.5999) data time 0.0008 (0.0043) model time 0.5717 (0.6103) loss 7.8387 (7.2606) grad_norm 2.1418 (2.4468) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:26:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][120/625] eta 0:05:02 lr 0.000421 wd 0.0500 time 0.5748 (0.5988) data time 0.0006 (0.0040) model time 0.5741 (0.6069) loss 7.4431 (7.2791) grad_norm 1.7592 (2.4584) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:27:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][130/625] eta 0:04:55 lr 0.000421 wd 0.0500 time 0.5717 (0.5969) data time 0.0006 (0.0038) model time 0.5711 (0.6026) loss 7.8615 (7.2767) grad_norm 2.8284 (2.4496) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:27:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][140/625] eta 0:04:48 lr 0.000421 wd 0.0500 time 0.5711 (0.5953) data time 0.0009 (0.0036) model time 0.5702 (0.5995) loss 7.0462 (7.2800) grad_norm 2.0708 (2.4402) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:27:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][150/625] eta 0:04:42 lr 0.000421 wd 0.0500 time 0.5746 (0.5939) data time 0.0006 (0.0034) model time 0.5740 (0.5967) loss 6.8966 (7.2570) grad_norm 2.2220 (2.4258) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:27:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][160/625] eta 0:04:35 lr 0.000421 wd 0.0500 time 0.5708 (0.5928) data time 0.0006 (0.0032) model time 0.5702 (0.5948) loss 6.1409 (7.2574) grad_norm 2.0834 (2.4248) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:27:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][170/625] eta 0:04:29 lr 0.000421 wd 0.0500 time 0.5691 (0.5917) data time 0.0008 (0.0031) model time 0.5682 (0.5930) loss 6.3541 (7.2482) grad_norm 1.6302 (2.4133) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:27:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][180/625] eta 0:04:22 lr 0.000421 wd 0.0500 time 0.5703 (0.5907) data time 0.0008 (0.0030) model time 0.5695 (0.5915) loss 7.7814 (7.2586) grad_norm 2.3547 (2.4019) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:27:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][190/625] eta 0:04:16 lr 0.000421 wd 0.0500 time 0.5711 (0.5898) data time 0.0006 (0.0029) model time 0.5705 (0.5901) loss 7.5674 (7.2581) grad_norm 2.1070 (2.3881) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:27:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][200/625] eta 0:04:10 lr 0.000420 wd 0.0500 time 0.5717 (0.5890) data time 0.0006 (0.0028) model time 0.5711 (0.5889) loss 7.3633 (7.2422) grad_norm 1.9191 (2.4065) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:27:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][210/625] eta 0:04:04 lr 0.000420 wd 0.0500 time 0.5732 (0.5883) data time 0.0006 (0.0027) model time 0.5726 (0.5880) loss 6.1192 (7.2434) grad_norm 1.9033 (2.4029) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:27:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][220/625] eta 0:03:57 lr 0.000420 wd 0.0500 time 0.5713 (0.5876) data time 0.0008 (0.0026) model time 0.5705 (0.5871) loss 6.3483 (7.2287) grad_norm 2.6066 (2.3949) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:28:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][230/625] eta 0:03:52 lr 0.000420 wd 0.0500 time 0.5706 (0.5876) data time 0.0007 (0.0025) model time 0.5699 (0.5870) loss 7.4985 (7.2432) grad_norm 1.6044 (2.3906) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:28:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][240/625] eta 0:03:46 lr 0.000420 wd 0.0500 time 0.5706 (0.5871) data time 0.0006 (0.0025) model time 0.5700 (0.5863) loss 6.7342 (7.2399) grad_norm 4.0770 (2.4155) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:28:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][250/625] eta 0:03:39 lr 0.000420 wd 0.0500 time 0.5702 (0.5865) data time 0.0008 (0.0024) model time 0.5693 (0.5856) loss 7.9435 (7.2340) grad_norm 2.6235 (2.4220) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:28:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][260/625] eta 0:03:33 lr 0.000420 wd 0.0500 time 0.5761 (0.5860) data time 0.0007 (0.0023) model time 0.5753 (0.5850) loss 8.3799 (7.2312) grad_norm 2.4917 (2.4196) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:28:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][270/625] eta 0:03:27 lr 0.000420 wd 0.0500 time 0.5712 (0.5856) data time 0.0008 (0.0023) model time 0.5704 (0.5844) loss 7.9261 (7.2414) grad_norm 2.4748 (2.4108) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:28:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][280/625] eta 0:03:22 lr 0.000420 wd 0.0500 time 0.5747 (0.5856) data time 0.0008 (0.0022) model time 0.5739 (0.5846) loss 7.9905 (7.2510) grad_norm 2.0631 (2.4535) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:28:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][290/625] eta 0:03:16 lr 0.000420 wd 0.0500 time 0.5676 (0.5852) data time 0.0008 (0.0022) model time 0.5668 (0.5841) loss 6.4846 (7.2488) grad_norm 2.1598 (2.4566) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:28:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][300/625] eta 0:03:10 lr 0.000419 wd 0.0500 time 0.7651 (0.5863) data time 0.0008 (0.0021) model time 0.7642 (0.5854) loss 7.2821 (7.2432) grad_norm 2.2536 (2.4526) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:28:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][310/625] eta 0:03:05 lr 0.000419 wd 0.0500 time 0.5712 (0.5877) data time 0.0006 (0.0021) model time 0.5706 (0.5871) loss 6.8277 (7.2488) grad_norm 2.0111 (2.4425) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:28:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][320/625] eta 0:02:59 lr 0.000419 wd 0.0500 time 0.7264 (0.5886) data time 0.0006 (0.0021) model time 0.7258 (0.5880) loss 5.2548 (7.2429) grad_norm 1.9310 (2.4309) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:28:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][330/625] eta 0:02:53 lr 0.000419 wd 0.0500 time 0.5684 (0.5889) data time 0.0009 (0.0020) model time 0.5675 (0.5885) loss 7.3510 (7.2572) grad_norm 2.8040 (2.4471) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:29:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][340/625] eta 0:02:47 lr 0.000419 wd 0.0500 time 0.5663 (0.5886) data time 0.0008 (0.0020) model time 0.5655 (0.5881) loss 7.8657 (7.2622) grad_norm 2.2403 (2.4448) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:29:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][350/625] eta 0:02:41 lr 0.000419 wd 0.0500 time 0.5723 (0.5882) data time 0.0008 (0.0020) model time 0.5715 (0.5876) loss 6.7275 (7.2583) grad_norm 2.9368 (2.4463) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:29:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][360/625] eta 0:02:35 lr 0.000419 wd 0.0500 time 0.5742 (0.5878) data time 0.0008 (0.0019) model time 0.5734 (0.5871) loss 6.5725 (7.2542) grad_norm 3.6314 (2.4601) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:29:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][370/625] eta 0:02:29 lr 0.000419 wd 0.0500 time 0.5722 (0.5874) data time 0.0007 (0.0019) model time 0.5715 (0.5867) loss 8.1979 (7.2502) grad_norm 2.3262 (2.4640) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:29:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][380/625] eta 0:02:23 lr 0.000419 wd 0.0500 time 0.5717 (0.5871) data time 0.0006 (0.0019) model time 0.5711 (0.5863) loss 7.2563 (7.2524) grad_norm 2.1879 (2.4656) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:29:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][390/625] eta 0:02:17 lr 0.000418 wd 0.0500 time 0.5640 (0.5868) data time 0.0008 (0.0019) model time 0.5632 (0.5860) loss 7.0080 (7.2532) grad_norm 2.0169 (2.4557) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:29:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][400/625] eta 0:02:11 lr 0.000418 wd 0.0500 time 0.5709 (0.5865) data time 0.0008 (0.0018) model time 0.5701 (0.5856) loss 7.3985 (7.2520) grad_norm 2.9607 (2.4730) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:29:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][410/625] eta 0:02:06 lr 0.000418 wd 0.0500 time 0.5734 (0.5862) data time 0.0008 (0.0018) model time 0.5725 (0.5853) loss 8.4546 (7.2547) grad_norm 2.3400 (2.4680) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:29:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][420/625] eta 0:02:00 lr 0.000418 wd 0.0500 time 0.5733 (0.5859) data time 0.0008 (0.0018) model time 0.5725 (0.5850) loss 8.1855 (7.2563) grad_norm 2.4665 (2.4667) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:29:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][430/625] eta 0:01:54 lr 0.000418 wd 0.0500 time 0.5689 (0.5857) data time 0.0008 (0.0018) model time 0.5681 (0.5847) loss 6.6781 (7.2631) grad_norm 4.5886 (2.4708) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:30:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][440/625] eta 0:01:48 lr 0.000418 wd 0.0500 time 0.5705 (0.5854) data time 0.0007 (0.0018) model time 0.5698 (0.5843) loss 8.2963 (7.2637) grad_norm 1.8920 (2.4809) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:30:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][450/625] eta 0:01:42 lr 0.000418 wd 0.0500 time 0.5729 (0.5855) data time 0.0006 (0.0017) model time 0.5723 (0.5845) loss 7.1492 (7.2759) grad_norm 2.0039 (2.4749) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:30:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][460/625] eta 0:01:36 lr 0.000418 wd 0.0500 time 0.5706 (0.5854) data time 0.0006 (0.0017) model time 0.5700 (0.5844) loss 7.1725 (7.2806) grad_norm 2.0090 (2.4712) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:30:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][470/625] eta 0:01:30 lr 0.000418 wd 0.0500 time 0.5728 (0.5853) data time 0.0008 (0.0017) model time 0.5720 (0.5842) loss 6.1482 (7.2778) grad_norm 3.6049 (2.5198) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:30:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][480/625] eta 0:01:24 lr 0.000418 wd 0.0500 time 0.5752 (0.5851) data time 0.0008 (0.0017) model time 0.5744 (0.5840) loss 6.9055 (7.2773) grad_norm 2.2518 (2.5297) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:30:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][490/625] eta 0:01:18 lr 0.000417 wd 0.0500 time 0.5747 (0.5849) data time 0.0006 (0.0017) model time 0.5741 (0.5839) loss 6.7807 (7.2859) grad_norm 2.5076 (2.5302) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:30:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][500/625] eta 0:01:13 lr 0.000417 wd 0.0500 time 0.5747 (0.5852) data time 0.0008 (0.0017) model time 0.5740 (0.5841) loss 8.3907 (7.2980) grad_norm 5.4420 (2.5393) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:30:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][510/625] eta 0:01:07 lr 0.000417 wd 0.0500 time 0.5729 (0.5850) data time 0.0009 (0.0016) model time 0.5720 (0.5839) loss 7.8678 (7.2977) grad_norm 4.5853 (2.5554) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:30:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][520/625] eta 0:01:01 lr 0.000417 wd 0.0500 time 0.5677 (0.5858) data time 0.0006 (0.0016) model time 0.5671 (0.5849) loss 7.6171 (7.2919) grad_norm 1.9412 (2.5499) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:30:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][530/625] eta 0:00:55 lr 0.000417 wd 0.0500 time 0.5709 (0.5871) data time 0.0008 (0.0016) model time 0.5701 (0.5863) loss 7.3896 (7.2917) grad_norm 2.2728 (2.5436) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:31:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][540/625] eta 0:00:49 lr 0.000417 wd 0.0500 time 0.6904 (0.5882) data time 0.0008 (0.0016) model time 0.6896 (0.5875) loss 6.3321 (7.2918) grad_norm 1.9417 (2.5335) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:31:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][550/625] eta 0:00:44 lr 0.000417 wd 0.0500 time 0.5672 (0.5886) data time 0.0008 (0.0016) model time 0.5663 (0.5879) loss 7.3749 (7.2866) grad_norm 2.1624 (2.5293) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:31:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][560/625] eta 0:00:38 lr 0.000417 wd 0.0500 time 0.5727 (0.5884) data time 0.0006 (0.0016) model time 0.5721 (0.5877) loss 7.8281 (7.2909) grad_norm 2.4130 (2.5255) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:31:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][570/625] eta 0:00:32 lr 0.000417 wd 0.0500 time 0.5730 (0.5881) data time 0.0006 (0.0016) model time 0.5724 (0.5874) loss 7.8854 (7.2920) grad_norm 2.6876 (2.5221) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:31:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][580/625] eta 0:00:26 lr 0.000417 wd 0.0500 time 0.5692 (0.5879) data time 0.0007 (0.0015) model time 0.5685 (0.5872) loss 7.8230 (7.2905) grad_norm 2.2477 (2.5303) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:31:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][590/625] eta 0:00:20 lr 0.000416 wd 0.0500 time 0.5758 (0.5877) data time 0.0007 (0.0015) model time 0.5751 (0.5869) loss 5.6393 (7.2855) grad_norm 1.9150 (2.5237) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:31:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][600/625] eta 0:00:14 lr 0.000416 wd 0.0500 time 0.5625 (0.5875) data time 0.0006 (0.0015) model time 0.5618 (0.5867) loss 7.7511 (7.2820) grad_norm 3.1246 (2.5299) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:31:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][610/625] eta 0:00:08 lr 0.000416 wd 0.0500 time 0.5731 (0.5873) data time 0.0004 (0.0015) model time 0.5727 (0.5864) loss 7.4630 (7.2748) grad_norm 2.4502 (2.5274) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:31:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [188/300][620/625] eta 0:00:02 lr 0.000416 wd 0.0500 time 0.5725 (0.5871) data time 0.0004 (0.0015) model time 0.5721 (0.5863) loss 7.3183 (7.2802) grad_norm 2.1526 (2.5247) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:31:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 188 training takes 0:06:06 +[2024-07-25 09:31:51 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 09:31:52 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 09:31:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.491 (0.491) Loss 0.5107 (0.5107) Acc@1 90.186 (90.186) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 09:31:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.159) Loss 0.7832 (0.6317) Acc@1 82.373 (86.887) Acc@5 96.729 (97.892) Mem 22339MB +[2024-07-25 09:31:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.8940 (0.7319) Acc@1 78.613 (83.922) Acc@5 95.361 (96.880) Mem 22339MB +[2024-07-25 09:31:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.625 Acc@5 96.875 +[2024-07-25 09:31:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.6% +[2024-07-25 09:31:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.782 (0.782) Loss 0.5015 (0.5015) Acc@1 90.039 (90.039) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-25 09:31:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.185) Loss 0.7510 (0.6213) Acc@1 83.203 (87.216) Acc@5 96.680 (97.963) Mem 22339MB +[2024-07-25 09:31:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.157) Loss 0.8760 (0.7162) Acc@1 78.613 (84.224) Acc@5 95.801 (97.038) Mem 22339MB +[2024-07-25 09:32:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.831 Acc@5 97.021 +[2024-07-25 09:32:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.8% +[2024-07-25 09:32:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][0/625] eta 0:15:03 lr 0.000416 wd 0.0500 time 1.4461 (1.4461) data time 0.5511 (0.5511) model time 0.0000 (0.0000) loss 7.2810 (7.2810) grad_norm 3.4859 (3.4859) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:32:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][10/625] eta 0:06:47 lr 0.000416 wd 0.0500 time 0.5744 (0.6626) data time 0.0006 (0.0509) model time 0.0000 (0.0000) loss 8.7492 (7.6501) grad_norm 2.8923 (2.5168) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:32:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][20/625] eta 0:06:15 lr 0.000416 wd 0.0500 time 0.5778 (0.6202) data time 0.0008 (0.0271) model time 0.0000 (0.0000) loss 6.3548 (7.6798) grad_norm 3.4245 (2.6351) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:32:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][30/625] eta 0:05:59 lr 0.000416 wd 0.0500 time 0.5761 (0.6049) data time 0.0006 (0.0186) model time 0.0000 (0.0000) loss 8.4312 (7.5703) grad_norm 2.0243 (2.4700) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:32:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][40/625] eta 0:05:49 lr 0.000416 wd 0.0500 time 0.5709 (0.5971) data time 0.0007 (0.0143) model time 0.0000 (0.0000) loss 6.1057 (7.5413) grad_norm 2.8331 (2.5673) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:32:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][50/625] eta 0:05:40 lr 0.000416 wd 0.0500 time 0.5764 (0.5924) data time 0.0008 (0.0116) model time 0.0000 (0.0000) loss 7.8803 (7.5495) grad_norm 5.0168 (2.6011) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:32:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][60/625] eta 0:05:32 lr 0.000416 wd 0.0500 time 0.5748 (0.5893) data time 0.0008 (0.0099) model time 0.5740 (0.5727) loss 7.7348 (7.5440) grad_norm 1.8733 (2.6441) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:32:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][70/625] eta 0:05:25 lr 0.000415 wd 0.0500 time 0.5826 (0.5872) data time 0.0007 (0.0086) model time 0.5819 (0.5733) loss 8.2468 (7.5413) grad_norm 2.1069 (2.6194) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:32:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][80/625] eta 0:05:19 lr 0.000415 wd 0.0500 time 0.5795 (0.5854) data time 0.0008 (0.0076) model time 0.5788 (0.5729) loss 6.7794 (7.5496) grad_norm 2.0690 (2.5600) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:32:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][90/625] eta 0:05:12 lr 0.000415 wd 0.0500 time 0.5757 (0.5841) data time 0.0006 (0.0069) model time 0.5751 (0.5728) loss 6.5709 (7.5241) grad_norm 1.9250 (2.5473) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:32:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][100/625] eta 0:05:07 lr 0.000415 wd 0.0500 time 0.5740 (0.5856) data time 0.0008 (0.0063) model time 0.5732 (0.5779) loss 7.1698 (7.4965) grad_norm 2.1982 (2.5216) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:33:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][110/625] eta 0:05:02 lr 0.000415 wd 0.0500 time 0.6543 (0.5866) data time 0.0007 (0.0058) model time 0.6536 (0.5809) loss 6.9242 (7.4407) grad_norm 2.5655 (2.4874) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:33:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][120/625] eta 0:04:57 lr 0.000415 wd 0.0500 time 0.7215 (0.5888) data time 0.0008 (0.0054) model time 0.7207 (0.5853) loss 5.6892 (7.4368) grad_norm 1.5887 (2.4688) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:33:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][130/625] eta 0:04:52 lr 0.000415 wd 0.0500 time 0.5801 (0.5901) data time 0.0007 (0.0051) model time 0.5794 (0.5878) loss 7.5610 (7.4418) grad_norm 5.2351 (2.4968) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:33:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][140/625] eta 0:04:47 lr 0.000415 wd 0.0500 time 0.5689 (0.5931) data time 0.0007 (0.0048) model time 0.5683 (0.5926) loss 8.5402 (7.4170) grad_norm 2.8481 (2.5524) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:33:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][150/625] eta 0:04:41 lr 0.000415 wd 0.0500 time 0.5753 (0.5932) data time 0.0007 (0.0045) model time 0.5746 (0.5928) loss 7.4350 (7.4071) grad_norm 2.1518 (2.5307) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:33:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][160/625] eta 0:04:35 lr 0.000415 wd 0.0500 time 0.5710 (0.5927) data time 0.0008 (0.0043) model time 0.5702 (0.5920) loss 7.2389 (7.4237) grad_norm 2.1732 (2.5167) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:33:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][170/625] eta 0:04:29 lr 0.000414 wd 0.0500 time 0.5760 (0.5916) data time 0.0006 (0.0041) model time 0.5754 (0.5904) loss 7.2786 (7.4150) grad_norm 1.7207 (2.4976) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:33:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][180/625] eta 0:04:22 lr 0.000414 wd 0.0500 time 0.5746 (0.5906) data time 0.0006 (0.0039) model time 0.5740 (0.5891) loss 8.4957 (7.4468) grad_norm 3.0918 (2.4905) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:33:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][190/625] eta 0:04:16 lr 0.000414 wd 0.0500 time 0.5772 (0.5898) data time 0.0008 (0.0037) model time 0.5764 (0.5880) loss 7.8704 (7.4453) grad_norm 2.8667 (2.5116) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:33:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][200/625] eta 0:04:10 lr 0.000414 wd 0.0500 time 0.5757 (0.5891) data time 0.0008 (0.0036) model time 0.5749 (0.5871) loss 8.0168 (7.4377) grad_norm 2.6619 (2.5129) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:34:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][210/625] eta 0:04:04 lr 0.000414 wd 0.0500 time 0.5746 (0.5884) data time 0.0008 (0.0035) model time 0.5738 (0.5863) loss 7.1613 (7.4350) grad_norm 2.7491 (2.5003) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:34:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][220/625] eta 0:03:58 lr 0.000414 wd 0.0500 time 0.5728 (0.5878) data time 0.0008 (0.0033) model time 0.5720 (0.5855) loss 8.4965 (7.4343) grad_norm 2.4757 (2.4940) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:34:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][230/625] eta 0:03:52 lr 0.000414 wd 0.0500 time 0.5743 (0.5879) data time 0.0006 (0.0032) model time 0.5737 (0.5858) loss 7.3129 (7.4258) grad_norm 12.7587 (2.5285) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:34:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][240/625] eta 0:03:46 lr 0.000414 wd 0.0500 time 0.5719 (0.5874) data time 0.0006 (0.0032) model time 0.5713 (0.5851) loss 7.7153 (7.4140) grad_norm 1.7159 (2.5185) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:34:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][250/625] eta 0:03:40 lr 0.000414 wd 0.0500 time 0.5711 (0.5868) data time 0.0006 (0.0031) model time 0.5705 (0.5845) loss 7.4522 (7.4167) grad_norm 5.7050 (2.5405) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:34:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][260/625] eta 0:03:34 lr 0.000413 wd 0.0500 time 0.5752 (0.5864) data time 0.0007 (0.0030) model time 0.5744 (0.5840) loss 7.6316 (7.4104) grad_norm 2.2212 (2.5328) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:34:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][270/625] eta 0:03:28 lr 0.000413 wd 0.0500 time 0.5741 (0.5860) data time 0.0008 (0.0030) model time 0.5733 (0.5836) loss 7.2070 (7.4130) grad_norm 1.9956 (2.5340) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:34:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][280/625] eta 0:03:22 lr 0.000413 wd 0.0500 time 0.5767 (0.5856) data time 0.0007 (0.0029) model time 0.5761 (0.5832) loss 7.8249 (7.4123) grad_norm 3.2583 (2.5434) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:34:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][290/625] eta 0:03:16 lr 0.000413 wd 0.0500 time 0.5738 (0.5852) data time 0.0006 (0.0028) model time 0.5732 (0.5827) loss 7.5477 (7.4143) grad_norm 1.8708 (2.5357) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:34:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][300/625] eta 0:03:10 lr 0.000413 wd 0.0500 time 0.6287 (0.5850) data time 0.0008 (0.0028) model time 0.6279 (0.5826) loss 7.2919 (7.4162) grad_norm 1.6714 (2.5194) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:35:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][310/625] eta 0:03:04 lr 0.000413 wd 0.0500 time 0.5795 (0.5848) data time 0.0006 (0.0028) model time 0.5790 (0.5823) loss 8.7262 (7.4154) grad_norm 1.6943 (2.5049) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:35:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][320/625] eta 0:02:58 lr 0.000413 wd 0.0500 time 0.5742 (0.5849) data time 0.0008 (0.0027) model time 0.5734 (0.5825) loss 7.0124 (7.4118) grad_norm 1.7151 (2.4895) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:35:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][330/625] eta 0:02:52 lr 0.000413 wd 0.0500 time 0.7064 (0.5853) data time 0.0009 (0.0026) model time 0.7055 (0.5830) loss 6.5496 (7.3949) grad_norm 2.8627 (2.4808) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:35:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][340/625] eta 0:02:47 lr 0.000413 wd 0.0500 time 0.7448 (0.5865) data time 0.0010 (0.0026) model time 0.7438 (0.5845) loss 8.2818 (7.3846) grad_norm 2.4261 (2.4790) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:35:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][350/625] eta 0:02:41 lr 0.000413 wd 0.0500 time 0.7082 (0.5874) data time 0.0009 (0.0027) model time 0.7074 (0.5854) loss 6.7322 (7.3799) grad_norm 2.2635 (2.4748) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:35:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][360/625] eta 0:02:36 lr 0.000412 wd 0.0500 time 0.7174 (0.5888) data time 0.0006 (0.0026) model time 0.7168 (0.5871) loss 7.4379 (7.3801) grad_norm 2.5954 (2.4801) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:35:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][370/625] eta 0:02:30 lr 0.000412 wd 0.0500 time 0.5750 (0.5892) data time 0.0007 (0.0026) model time 0.5743 (0.5875) loss 8.4887 (7.3674) grad_norm 2.8615 (2.4886) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:35:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][380/625] eta 0:02:24 lr 0.000412 wd 0.0500 time 0.5780 (0.5889) data time 0.0006 (0.0025) model time 0.5774 (0.5873) loss 8.0554 (7.3640) grad_norm 2.6889 (2.5057) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:35:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][390/625] eta 0:02:18 lr 0.000412 wd 0.0500 time 0.5763 (0.5887) data time 0.0007 (0.0025) model time 0.5756 (0.5870) loss 7.7908 (7.3705) grad_norm 2.6737 (2.5265) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:35:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][400/625] eta 0:02:12 lr 0.000412 wd 0.0500 time 0.5729 (0.5883) data time 0.0009 (0.0025) model time 0.5721 (0.5866) loss 7.6616 (7.3692) grad_norm 3.8047 (2.5472) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:36:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][410/625] eta 0:02:06 lr 0.000412 wd 0.0500 time 0.5877 (0.5881) data time 0.0009 (0.0024) model time 0.5868 (0.5863) loss 6.3228 (7.3642) grad_norm 3.8918 (2.5610) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:36:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][420/625] eta 0:02:00 lr 0.000412 wd 0.0500 time 0.5716 (0.5877) data time 0.0006 (0.0024) model time 0.5710 (0.5859) loss 6.9607 (7.3658) grad_norm 2.1235 (2.5617) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:36:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][430/625] eta 0:01:54 lr 0.000412 wd 0.0500 time 0.5748 (0.5874) data time 0.0008 (0.0024) model time 0.5740 (0.5856) loss 8.2516 (7.3648) grad_norm 1.8829 (2.5574) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:36:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][440/625] eta 0:01:48 lr 0.000412 wd 0.0500 time 0.5788 (0.5871) data time 0.0008 (0.0023) model time 0.5780 (0.5853) loss 5.6003 (7.3623) grad_norm 2.1638 (2.5574) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:36:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][450/625] eta 0:01:42 lr 0.000412 wd 0.0500 time 0.5870 (0.5869) data time 0.0008 (0.0023) model time 0.5862 (0.5851) loss 7.7964 (7.3674) grad_norm 1.6629 (2.5574) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:36:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][460/625] eta 0:01:36 lr 0.000411 wd 0.0500 time 0.5743 (0.5866) data time 0.0008 (0.0023) model time 0.5735 (0.5848) loss 8.5015 (7.3696) grad_norm 1.8347 (2.5581) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:36:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][470/625] eta 0:01:30 lr 0.000411 wd 0.0500 time 0.6360 (0.5865) data time 0.0006 (0.0023) model time 0.6354 (0.5847) loss 6.8097 (7.3697) grad_norm 2.5220 (2.5529) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:36:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][480/625] eta 0:01:25 lr 0.000411 wd 0.0500 time 0.5727 (0.5862) data time 0.0008 (0.0022) model time 0.5719 (0.5844) loss 7.6424 (7.3696) grad_norm 2.3027 (2.5565) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:36:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][490/625] eta 0:01:19 lr 0.000411 wd 0.0500 time 0.5734 (0.5860) data time 0.0006 (0.0022) model time 0.5728 (0.5842) loss 8.6408 (7.3781) grad_norm 1.8016 (2.5485) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:36:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][500/625] eta 0:01:13 lr 0.000411 wd 0.0500 time 0.6896 (0.5860) data time 0.0008 (0.0022) model time 0.6888 (0.5842) loss 7.8654 (7.3674) grad_norm 2.6713 (2.5398) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:36:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][510/625] eta 0:01:07 lr 0.000411 wd 0.0500 time 0.5785 (0.5858) data time 0.0006 (0.0021) model time 0.5779 (0.5839) loss 5.8248 (7.3599) grad_norm 1.9645 (2.5385) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:37:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][520/625] eta 0:01:01 lr 0.000411 wd 0.0500 time 0.5826 (0.5856) data time 0.0009 (0.0021) model time 0.5817 (0.5837) loss 7.1575 (7.3625) grad_norm 3.3483 (2.5336) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:37:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][530/625] eta 0:00:55 lr 0.000411 wd 0.0500 time 0.5773 (0.5854) data time 0.0008 (0.0021) model time 0.5766 (0.5835) loss 6.0428 (7.3612) grad_norm 1.6584 (2.5276) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:37:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][540/625] eta 0:00:49 lr 0.000411 wd 0.0500 time 0.5769 (0.5854) data time 0.0006 (0.0021) model time 0.5763 (0.5836) loss 6.4385 (7.3540) grad_norm 1.7048 (2.5182) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:37:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][550/625] eta 0:00:43 lr 0.000411 wd 0.0500 time 0.7090 (0.5859) data time 0.0008 (0.0021) model time 0.7082 (0.5841) loss 8.4681 (7.3552) grad_norm 2.5115 (2.5168) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:37:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][560/625] eta 0:00:38 lr 0.000410 wd 0.0500 time 0.7417 (0.5866) data time 0.0006 (0.0020) model time 0.7410 (0.5849) loss 6.4000 (7.3563) grad_norm 4.7916 (2.5258) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:37:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][570/625] eta 0:00:32 lr 0.000410 wd 0.0500 time 0.7407 (0.5870) data time 0.0009 (0.0020) model time 0.7398 (0.5854) loss 8.5192 (7.3639) grad_norm 2.1412 (2.5225) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:37:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][580/625] eta 0:00:26 lr 0.000410 wd 0.0500 time 0.7349 (0.5875) data time 0.0007 (0.0020) model time 0.7342 (0.5859) loss 8.0236 (7.3704) grad_norm 2.6145 (2.5245) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:37:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][590/625] eta 0:00:20 lr 0.000410 wd 0.0500 time 0.5765 (0.5876) data time 0.0006 (0.0020) model time 0.5759 (0.5861) loss 6.1711 (7.3702) grad_norm 1.5235 (2.5211) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:37:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][600/625] eta 0:00:14 lr 0.000410 wd 0.0500 time 0.5821 (0.5875) data time 0.0008 (0.0020) model time 0.5813 (0.5860) loss 8.5210 (7.3750) grad_norm 1.8755 (2.5162) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:37:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][610/625] eta 0:00:08 lr 0.000410 wd 0.0500 time 0.5819 (0.5873) data time 0.0005 (0.0019) model time 0.5814 (0.5858) loss 7.2307 (7.3701) grad_norm 1.6532 (2.5139) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:38:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [189/300][620/625] eta 0:00:02 lr 0.000410 wd 0.0500 time 0.5761 (0.5871) data time 0.0004 (0.0019) model time 0.5757 (0.5856) loss 6.1961 (7.3715) grad_norm 1.6697 (2.5135) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:38:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 189 training takes 0:06:06 +[2024-07-25 09:38:06 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 09:38:08 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 09:38:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.482 (0.482) Loss 0.5234 (0.5234) Acc@1 89.893 (89.893) Acc@5 98.633 (98.633) Mem 22339MB +[2024-07-25 09:38:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.7808 (0.6388) Acc@1 82.422 (86.981) Acc@5 96.289 (97.820) Mem 22339MB +[2024-07-25 09:38:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.9033 (0.7393) Acc@1 78.711 (84.019) Acc@5 95.947 (96.922) Mem 22339MB +[2024-07-25 09:38:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.651 Acc@5 96.899 +[2024-07-25 09:38:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.7% +[2024-07-25 09:38:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 83.65% +[2024-07-25 09:38:11 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 09:38:13 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 09:38:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.478 (0.478) Loss 0.5015 (0.5015) Acc@1 90.137 (90.137) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-25 09:38:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7520 (0.6214) Acc@1 83.203 (87.247) Acc@5 96.680 (97.971) Mem 22339MB +[2024-07-25 09:38:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8755 (0.7162) Acc@1 78.760 (84.261) Acc@5 95.801 (97.052) Mem 22339MB +[2024-07-25 09:38:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.869 Acc@5 97.035 +[2024-07-25 09:38:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.9% +[2024-07-25 09:38:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.87% +[2024-07-25 09:38:17 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 09:38:18 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 09:38:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][0/625] eta 0:09:28 lr 0.000410 wd 0.0500 time 0.9101 (0.9101) data time 0.3932 (0.3932) model time 0.0000 (0.0000) loss 6.7649 (6.7649) grad_norm 2.7313 (2.7313) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:38:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][10/625] eta 0:06:11 lr 0.000410 wd 0.0500 time 0.5692 (0.6033) data time 0.0008 (0.0369) model time 0.0000 (0.0000) loss 6.0890 (7.4257) grad_norm 2.3551 (2.3149) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:38:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][20/625] eta 0:05:56 lr 0.000410 wd 0.0500 time 0.5632 (0.5890) data time 0.0006 (0.0197) model time 0.0000 (0.0000) loss 6.3132 (7.0945) grad_norm 1.7643 (2.3654) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:38:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][30/625] eta 0:05:47 lr 0.000410 wd 0.0500 time 0.5707 (0.5841) data time 0.0008 (0.0136) model time 0.0000 (0.0000) loss 8.2151 (7.2305) grad_norm 1.6787 (2.3752) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:38:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][40/625] eta 0:05:40 lr 0.000409 wd 0.0500 time 0.5716 (0.5812) data time 0.0008 (0.0105) model time 0.0000 (0.0000) loss 7.8887 (7.3833) grad_norm 1.7777 (2.3222) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:38:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][50/625] eta 0:05:33 lr 0.000409 wd 0.0500 time 0.5700 (0.5796) data time 0.0008 (0.0086) model time 0.0000 (0.0000) loss 6.1829 (7.3640) grad_norm 1.7255 (2.2148) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:38:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][60/625] eta 0:05:27 lr 0.000409 wd 0.0500 time 0.5719 (0.5789) data time 0.0006 (0.0073) model time 0.5713 (0.5742) loss 7.4600 (7.3285) grad_norm 1.8831 (2.1622) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:38:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][70/625] eta 0:05:20 lr 0.000409 wd 0.0500 time 0.5720 (0.5782) data time 0.0006 (0.0064) model time 0.5714 (0.5736) loss 6.4968 (7.3417) grad_norm 2.0673 (2.1652) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:39:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][80/625] eta 0:05:14 lr 0.000409 wd 0.0500 time 0.5717 (0.5779) data time 0.0006 (0.0057) model time 0.5711 (0.5743) loss 5.6962 (7.3263) grad_norm 2.6759 (2.1968) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:39:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][90/625] eta 0:05:09 lr 0.000409 wd 0.0500 time 0.5695 (0.5776) data time 0.0008 (0.0052) model time 0.5687 (0.5742) loss 7.7004 (7.3255) grad_norm 1.9833 (2.1885) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:39:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][100/625] eta 0:05:03 lr 0.000409 wd 0.0500 time 0.5748 (0.5773) data time 0.0006 (0.0048) model time 0.5742 (0.5741) loss 7.8383 (7.2812) grad_norm 2.4191 (2.2438) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:39:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][110/625] eta 0:04:57 lr 0.000409 wd 0.0500 time 0.5721 (0.5770) data time 0.0007 (0.0044) model time 0.5714 (0.5739) loss 6.7200 (7.3164) grad_norm 2.1300 (2.2922) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:39:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][120/625] eta 0:04:51 lr 0.000409 wd 0.0500 time 0.5737 (0.5770) data time 0.0008 (0.0041) model time 0.5729 (0.5742) loss 6.7994 (7.3050) grad_norm 2.5753 (2.2948) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:39:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][130/625] eta 0:04:45 lr 0.000409 wd 0.0500 time 0.5711 (0.5772) data time 0.0007 (0.0039) model time 0.5704 (0.5748) loss 8.2365 (7.2895) grad_norm 2.0729 (2.3248) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:39:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][140/625] eta 0:04:40 lr 0.000408 wd 0.0500 time 0.5759 (0.5779) data time 0.0006 (0.0037) model time 0.5753 (0.5761) loss 7.9865 (7.3236) grad_norm 1.9004 (2.3722) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:39:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][150/625] eta 0:04:35 lr 0.000408 wd 0.0500 time 0.5746 (0.5800) data time 0.0008 (0.0035) model time 0.5738 (0.5793) loss 7.7410 (7.3164) grad_norm 1.6637 (2.3627) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:39:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][160/625] eta 0:04:31 lr 0.000408 wd 0.0500 time 0.7414 (0.5843) data time 0.0006 (0.0033) model time 0.7408 (0.5856) loss 8.5300 (7.3464) grad_norm 2.0987 (2.3420) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:39:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][170/625] eta 0:04:26 lr 0.000408 wd 0.0500 time 0.5685 (0.5859) data time 0.0008 (0.0032) model time 0.5676 (0.5878) loss 8.6548 (7.3708) grad_norm 2.5456 (2.3310) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:40:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][180/625] eta 0:04:21 lr 0.000408 wd 0.0500 time 0.5710 (0.5884) data time 0.0007 (0.0030) model time 0.5704 (0.5910) loss 7.1265 (7.3655) grad_norm 2.5945 (2.3713) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:40:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][190/625] eta 0:04:15 lr 0.000408 wd 0.0500 time 0.5624 (0.5884) data time 0.0006 (0.0029) model time 0.5618 (0.5907) loss 9.0723 (7.3938) grad_norm 1.9927 (2.3584) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:40:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][200/625] eta 0:04:09 lr 0.000408 wd 0.0500 time 0.5739 (0.5877) data time 0.0006 (0.0028) model time 0.5732 (0.5895) loss 7.1517 (7.4024) grad_norm 2.4314 (2.3630) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:40:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][210/625] eta 0:04:03 lr 0.000408 wd 0.0500 time 0.5731 (0.5871) data time 0.0006 (0.0027) model time 0.5726 (0.5886) loss 8.3352 (7.3883) grad_norm 2.6710 (2.3976) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:40:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][220/625] eta 0:03:57 lr 0.000408 wd 0.0500 time 0.5653 (0.5865) data time 0.0008 (0.0026) model time 0.5645 (0.5877) loss 9.0695 (7.4050) grad_norm 2.1062 (2.4151) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:40:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][230/625] eta 0:03:51 lr 0.000408 wd 0.0500 time 0.5716 (0.5859) data time 0.0006 (0.0026) model time 0.5710 (0.5868) loss 7.4544 (7.4046) grad_norm 3.9375 (2.4573) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:40:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][240/625] eta 0:03:45 lr 0.000407 wd 0.0500 time 0.5700 (0.5854) data time 0.0008 (0.0025) model time 0.5692 (0.5861) loss 8.0111 (7.4141) grad_norm 2.3980 (2.4761) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:40:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][250/625] eta 0:03:39 lr 0.000407 wd 0.0500 time 0.5731 (0.5850) data time 0.0008 (0.0024) model time 0.5723 (0.5855) loss 7.2566 (7.3982) grad_norm 2.5439 (2.4615) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:40:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][260/625] eta 0:03:33 lr 0.000407 wd 0.0500 time 0.5723 (0.5846) data time 0.0006 (0.0024) model time 0.5716 (0.5850) loss 7.8581 (7.3892) grad_norm 2.1795 (2.4965) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:40:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][270/625] eta 0:03:27 lr 0.000407 wd 0.0500 time 0.5717 (0.5842) data time 0.0008 (0.0023) model time 0.5709 (0.5844) loss 7.8639 (7.3904) grad_norm 2.2836 (2.4847) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:41:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][280/625] eta 0:03:21 lr 0.000407 wd 0.0500 time 0.5718 (0.5838) data time 0.0008 (0.0023) model time 0.5710 (0.5839) loss 8.3027 (7.3897) grad_norm 1.7400 (2.4748) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:41:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][290/625] eta 0:03:15 lr 0.000407 wd 0.0500 time 0.5777 (0.5836) data time 0.0007 (0.0022) model time 0.5769 (0.5835) loss 7.5712 (7.3973) grad_norm 1.8916 (2.4733) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:41:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][300/625] eta 0:03:09 lr 0.000407 wd 0.0500 time 0.5695 (0.5833) data time 0.0008 (0.0022) model time 0.5687 (0.5831) loss 8.7286 (7.4010) grad_norm 3.4936 (2.4801) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:41:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][310/625] eta 0:03:03 lr 0.000407 wd 0.0500 time 0.5711 (0.5830) data time 0.0006 (0.0021) model time 0.5704 (0.5828) loss 7.8480 (7.4084) grad_norm 7.4112 (2.5055) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:41:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][320/625] eta 0:02:57 lr 0.000407 wd 0.0500 time 0.5742 (0.5828) data time 0.0008 (0.0021) model time 0.5734 (0.5825) loss 7.9823 (7.4237) grad_norm 3.1850 (2.5762) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:41:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][330/625] eta 0:02:51 lr 0.000406 wd 0.0500 time 0.5753 (0.5825) data time 0.0006 (0.0021) model time 0.5747 (0.5822) loss 7.6742 (7.4195) grad_norm 2.8889 (2.5884) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:41:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][340/625] eta 0:02:45 lr 0.000406 wd 0.0500 time 0.5755 (0.5823) data time 0.0007 (0.0020) model time 0.5749 (0.5819) loss 6.5493 (7.4201) grad_norm 1.8420 (2.5787) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:41:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][350/625] eta 0:02:40 lr 0.000406 wd 0.0500 time 0.6257 (0.5823) data time 0.0006 (0.0020) model time 0.6251 (0.5818) loss 5.7500 (7.3999) grad_norm 1.9623 (2.5818) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:41:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][360/625] eta 0:02:34 lr 0.000406 wd 0.0500 time 0.5677 (0.5825) data time 0.0006 (0.0020) model time 0.5671 (0.5821) loss 6.7164 (7.4048) grad_norm 2.6065 (2.5917) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:41:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][370/625] eta 0:02:28 lr 0.000406 wd 0.0500 time 0.5698 (0.5828) data time 0.0008 (0.0019) model time 0.5690 (0.5825) loss 7.5066 (7.4144) grad_norm 2.2512 (2.5899) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:42:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][380/625] eta 0:02:23 lr 0.000406 wd 0.0500 time 0.7338 (0.5841) data time 0.0007 (0.0019) model time 0.7331 (0.5839) loss 6.0408 (7.4154) grad_norm 2.3266 (2.6076) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:42:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][390/625] eta 0:02:17 lr 0.000406 wd 0.0500 time 0.7552 (0.5851) data time 0.0008 (0.0019) model time 0.7544 (0.5851) loss 8.1707 (7.4140) grad_norm 2.1516 (2.6056) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:42:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][400/625] eta 0:02:11 lr 0.000406 wd 0.0500 time 0.5720 (0.5862) data time 0.0006 (0.0019) model time 0.5714 (0.5863) loss 7.1253 (7.4288) grad_norm 1.8531 (2.5978) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:42:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][410/625] eta 0:02:06 lr 0.000406 wd 0.0500 time 0.5674 (0.5863) data time 0.0007 (0.0018) model time 0.5668 (0.5864) loss 6.2214 (7.4298) grad_norm 1.8783 (2.5867) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:42:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][420/625] eta 0:02:00 lr 0.000406 wd 0.0500 time 0.5711 (0.5860) data time 0.0009 (0.0018) model time 0.5702 (0.5860) loss 6.3324 (7.4142) grad_norm 2.0469 (2.5731) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:42:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][430/625] eta 0:01:54 lr 0.000405 wd 0.0500 time 0.5726 (0.5858) data time 0.0008 (0.0018) model time 0.5717 (0.5857) loss 7.0034 (7.4275) grad_norm 1.7233 (2.5585) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:42:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][440/625] eta 0:01:48 lr 0.000405 wd 0.0500 time 0.5649 (0.5855) data time 0.0006 (0.0018) model time 0.5643 (0.5854) loss 6.3023 (7.4193) grad_norm 2.4260 (2.5490) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:42:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][450/625] eta 0:01:42 lr 0.000405 wd 0.0500 time 0.5686 (0.5853) data time 0.0006 (0.0018) model time 0.5680 (0.5851) loss 7.5662 (7.4191) grad_norm 2.8829 (2.5397) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:42:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][460/625] eta 0:01:36 lr 0.000405 wd 0.0500 time 0.5742 (0.5850) data time 0.0006 (0.0017) model time 0.5736 (0.5848) loss 5.9112 (7.4181) grad_norm 3.1731 (2.5373) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:42:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][470/625] eta 0:01:30 lr 0.000405 wd 0.0500 time 0.5703 (0.5848) data time 0.0008 (0.0017) model time 0.5695 (0.5845) loss 8.8758 (7.4234) grad_norm 3.1068 (2.5437) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:42:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][480/625] eta 0:01:24 lr 0.000405 wd 0.0500 time 0.5732 (0.5846) data time 0.0008 (0.0017) model time 0.5724 (0.5843) loss 9.1430 (7.4317) grad_norm 2.0483 (2.5490) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:43:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][490/625] eta 0:01:18 lr 0.000405 wd 0.0500 time 0.5731 (0.5844) data time 0.0008 (0.0017) model time 0.5723 (0.5841) loss 8.4553 (7.4324) grad_norm 2.2038 (2.5453) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:43:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][500/625] eta 0:01:13 lr 0.000405 wd 0.0500 time 0.5762 (0.5842) data time 0.0006 (0.0017) model time 0.5756 (0.5839) loss 6.4328 (7.4278) grad_norm 3.9704 (2.5499) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:43:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][510/625] eta 0:01:07 lr 0.000405 wd 0.0500 time 0.5717 (0.5840) data time 0.0006 (0.0017) model time 0.5711 (0.5836) loss 8.4322 (7.4294) grad_norm 4.3774 (2.5484) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:43:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][520/625] eta 0:01:01 lr 0.000405 wd 0.0500 time 0.5755 (0.5839) data time 0.0008 (0.0016) model time 0.5747 (0.5835) loss 8.8225 (7.4336) grad_norm 1.5674 (2.5405) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:43:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][530/625] eta 0:00:55 lr 0.000404 wd 0.0500 time 0.5740 (0.5837) data time 0.0008 (0.0016) model time 0.5733 (0.5833) loss 7.2678 (7.4395) grad_norm 2.1039 (2.5386) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:43:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][540/625] eta 0:00:49 lr 0.000404 wd 0.0500 time 0.5675 (0.5836) data time 0.0007 (0.0016) model time 0.5669 (0.5831) loss 7.3731 (7.4352) grad_norm 2.1553 (2.5407) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:43:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][550/625] eta 0:00:43 lr 0.000404 wd 0.0500 time 0.5718 (0.5834) data time 0.0008 (0.0016) model time 0.5710 (0.5829) loss 8.4213 (7.4341) grad_norm 2.5813 (2.5419) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:43:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][560/625] eta 0:00:37 lr 0.000404 wd 0.0500 time 0.5719 (0.5833) data time 0.0008 (0.0016) model time 0.5711 (0.5828) loss 8.1332 (7.4293) grad_norm 1.6460 (2.5308) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:43:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][570/625] eta 0:00:32 lr 0.000404 wd 0.0500 time 0.6987 (0.5834) data time 0.0008 (0.0016) model time 0.6979 (0.5829) loss 8.8472 (7.4227) grad_norm 2.6562 (2.5258) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:43:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][580/625] eta 0:00:26 lr 0.000404 wd 0.0500 time 0.5724 (0.5833) data time 0.0009 (0.0016) model time 0.5715 (0.5828) loss 7.7859 (7.4266) grad_norm 2.1928 (2.5288) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:44:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][590/625] eta 0:00:20 lr 0.000404 wd 0.0500 time 0.5762 (0.5835) data time 0.0006 (0.0015) model time 0.5755 (0.5830) loss 6.3992 (7.4264) grad_norm 1.6944 (2.5232) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:44:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][600/625] eta 0:00:14 lr 0.000404 wd 0.0500 time 0.5709 (0.5843) data time 0.0006 (0.0015) model time 0.5703 (0.5839) loss 7.4575 (7.4310) grad_norm 8.0673 (2.5294) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:44:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][610/625] eta 0:00:08 lr 0.000404 wd 0.0500 time 0.7163 (0.5850) data time 0.0004 (0.0015) model time 0.7159 (0.5846) loss 6.2637 (7.4285) grad_norm 3.0245 (2.5384) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:44:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [190/300][620/625] eta 0:00:02 lr 0.000404 wd 0.0500 time 0.5753 (0.5856) data time 0.0006 (0.0015) model time 0.5748 (0.5853) loss 5.5601 (7.4279) grad_norm 3.8336 (2.5485) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:44:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 190 training takes 0:06:05 +[2024-07-25 09:44:24 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 09:44:26 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 09:44:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.480 (0.480) Loss 0.4961 (0.4961) Acc@1 90.381 (90.381) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 09:44:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.7764 (0.6278) Acc@1 82.666 (87.149) Acc@5 96.289 (97.869) Mem 22339MB +[2024-07-25 09:44:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.8774 (0.7290) Acc@1 78.857 (84.056) Acc@5 95.557 (96.852) Mem 22339MB +[2024-07-25 09:44:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.649 Acc@5 96.825 +[2024-07-25 09:44:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.6% +[2024-07-25 09:44:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.926 (0.926) Loss 0.5015 (0.5015) Acc@1 90.186 (90.186) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-25 09:44:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.198) Loss 0.7520 (0.6215) Acc@1 83.301 (87.269) Acc@5 96.533 (97.954) Mem 22339MB +[2024-07-25 09:44:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.164) Loss 0.8755 (0.7162) Acc@1 78.760 (84.294) Acc@5 95.703 (97.040) Mem 22339MB +[2024-07-25 09:44:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.899 Acc@5 97.031 +[2024-07-25 09:44:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.9% +[2024-07-25 09:44:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.90% +[2024-07-25 09:44:33 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 09:44:34 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 09:44:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][0/625] eta 0:09:00 lr 0.000404 wd 0.0500 time 0.8645 (0.8645) data time 0.3468 (0.3468) model time 0.0000 (0.0000) loss 7.2725 (7.2725) grad_norm 5.7703 (5.7703) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 09:44:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][10/625] eta 0:06:17 lr 0.000403 wd 0.0500 time 0.5729 (0.6143) data time 0.0005 (0.0324) model time 0.0000 (0.0000) loss 6.5491 (7.3724) grad_norm 1.9273 (3.5511) loss_scale 1024.0000 (977.4545) mem 22339MB +[2024-07-25 09:44:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][20/625] eta 0:06:00 lr 0.000403 wd 0.0500 time 0.5623 (0.5952) data time 0.0005 (0.0174) model time 0.0000 (0.0000) loss 5.8288 (7.0503) grad_norm 2.5136 (3.0010) loss_scale 1024.0000 (999.6190) mem 22339MB +[2024-07-25 09:44:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][30/625] eta 0:05:50 lr 0.000403 wd 0.0500 time 0.5741 (0.5897) data time 0.0008 (0.0120) model time 0.0000 (0.0000) loss 8.7622 (7.0696) grad_norm 2.2432 (2.7632) loss_scale 1024.0000 (1007.4839) mem 22339MB +[2024-07-25 09:44:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][40/625] eta 0:05:43 lr 0.000403 wd 0.0500 time 0.5726 (0.5867) data time 0.0008 (0.0093) model time 0.0000 (0.0000) loss 6.1771 (7.0529) grad_norm 1.7259 (2.6563) loss_scale 1024.0000 (1011.5122) mem 22339MB +[2024-07-25 09:45:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][50/625] eta 0:05:36 lr 0.000403 wd 0.0500 time 0.5767 (0.5849) data time 0.0007 (0.0077) model time 0.0000 (0.0000) loss 8.2492 (7.2016) grad_norm 2.3089 (2.6823) loss_scale 1024.0000 (1013.9608) mem 22339MB +[2024-07-25 09:45:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][60/625] eta 0:05:29 lr 0.000403 wd 0.0500 time 0.5700 (0.5833) data time 0.0006 (0.0066) model time 0.5694 (0.5742) loss 6.2368 (7.2878) grad_norm 1.9005 (2.6454) loss_scale 1024.0000 (1015.6066) mem 22339MB +[2024-07-25 09:45:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][70/625] eta 0:05:23 lr 0.000403 wd 0.0500 time 0.5622 (0.5824) data time 0.0006 (0.0058) model time 0.5615 (0.5751) loss 7.1918 (7.2623) grad_norm 2.1524 (2.5515) loss_scale 1024.0000 (1016.7887) mem 22339MB +[2024-07-25 09:45:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][80/625] eta 0:05:17 lr 0.000403 wd 0.0500 time 0.5713 (0.5821) data time 0.0008 (0.0052) model time 0.5705 (0.5764) loss 7.5849 (7.2813) grad_norm 2.2808 (2.5026) loss_scale 1024.0000 (1017.6790) mem 22339MB +[2024-07-25 09:45:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][90/625] eta 0:05:10 lr 0.000403 wd 0.0500 time 0.5724 (0.5812) data time 0.0006 (0.0047) model time 0.5719 (0.5756) loss 6.7936 (7.2795) grad_norm 2.1671 (2.5326) loss_scale 1024.0000 (1018.3736) mem 22339MB +[2024-07-25 09:45:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][100/625] eta 0:05:04 lr 0.000403 wd 0.0500 time 0.5771 (0.5805) data time 0.0006 (0.0043) model time 0.5764 (0.5752) loss 6.8276 (7.2910) grad_norm 2.3404 (2.5072) loss_scale 1024.0000 (1018.9307) mem 22339MB +[2024-07-25 09:45:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][110/625] eta 0:04:58 lr 0.000402 wd 0.0500 time 0.5644 (0.5799) data time 0.0008 (0.0040) model time 0.5637 (0.5749) loss 7.5436 (7.3000) grad_norm 2.1814 (2.4935) loss_scale 1024.0000 (1019.3874) mem 22339MB +[2024-07-25 09:45:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][120/625] eta 0:04:52 lr 0.000402 wd 0.0500 time 0.5735 (0.5795) data time 0.0008 (0.0037) model time 0.5728 (0.5748) loss 7.0749 (7.3018) grad_norm 1.9813 (2.4488) loss_scale 1024.0000 (1019.7686) mem 22339MB +[2024-07-25 09:45:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][130/625] eta 0:04:46 lr 0.000402 wd 0.0500 time 0.5604 (0.5792) data time 0.0008 (0.0036) model time 0.5596 (0.5746) loss 6.4583 (7.3074) grad_norm 1.7782 (2.4555) loss_scale 1024.0000 (1020.0916) mem 22339MB +[2024-07-25 09:45:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][140/625] eta 0:04:40 lr 0.000402 wd 0.0500 time 0.5729 (0.5792) data time 0.0009 (0.0034) model time 0.5720 (0.5750) loss 7.1551 (7.3283) grad_norm 1.8231 (2.4758) loss_scale 1024.0000 (1020.3688) mem 22339MB +[2024-07-25 09:46:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][150/625] eta 0:04:34 lr 0.000402 wd 0.0500 time 0.5707 (0.5788) data time 0.0006 (0.0032) model time 0.5701 (0.5748) loss 6.5023 (7.3352) grad_norm 2.2187 (2.4745) loss_scale 1024.0000 (1020.6093) mem 22339MB +[2024-07-25 09:46:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][160/625] eta 0:04:28 lr 0.000402 wd 0.0500 time 0.5683 (0.5784) data time 0.0008 (0.0031) model time 0.5675 (0.5745) loss 8.1425 (7.3519) grad_norm 2.1747 (2.4680) loss_scale 1024.0000 (1020.8199) mem 22339MB +[2024-07-25 09:46:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][170/625] eta 0:04:23 lr 0.000402 wd 0.0500 time 0.5735 (0.5798) data time 0.0006 (0.0029) model time 0.5728 (0.5768) loss 6.8409 (7.3460) grad_norm 3.6023 (2.4628) loss_scale 1024.0000 (1021.0058) mem 22339MB +[2024-07-25 09:46:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][180/625] eta 0:04:17 lr 0.000402 wd 0.0500 time 0.5706 (0.5795) data time 0.0006 (0.0028) model time 0.5700 (0.5765) loss 6.9819 (7.3479) grad_norm 2.6206 (2.4586) loss_scale 1024.0000 (1021.1713) mem 22339MB +[2024-07-25 09:46:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][190/625] eta 0:04:13 lr 0.000402 wd 0.0500 time 0.5711 (0.5818) data time 0.0006 (0.0027) model time 0.5705 (0.5798) loss 6.6484 (7.3474) grad_norm 2.0147 (2.4493) loss_scale 1024.0000 (1021.3194) mem 22339MB +[2024-07-25 09:46:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][200/625] eta 0:04:08 lr 0.000402 wd 0.0500 time 0.5679 (0.5846) data time 0.0006 (0.0026) model time 0.5673 (0.5837) loss 8.0727 (7.3612) grad_norm 1.8528 (2.4334) loss_scale 1024.0000 (1021.4527) mem 22339MB +[2024-07-25 09:46:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][210/625] eta 0:04:03 lr 0.000401 wd 0.0500 time 0.7496 (0.5871) data time 0.0006 (0.0025) model time 0.7490 (0.5869) loss 7.0847 (7.3573) grad_norm 2.8602 (2.4328) loss_scale 1024.0000 (1021.5735) mem 22339MB +[2024-07-25 09:46:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][220/625] eta 0:03:57 lr 0.000401 wd 0.0500 time 0.6208 (0.5867) data time 0.0007 (0.0024) model time 0.6202 (0.5863) loss 7.0745 (7.3781) grad_norm 2.8246 (2.4342) loss_scale 1024.0000 (1021.6833) mem 22339MB +[2024-07-25 09:46:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][230/625] eta 0:03:51 lr 0.000401 wd 0.0500 time 0.5697 (0.5864) data time 0.0006 (0.0024) model time 0.5691 (0.5859) loss 8.7369 (7.3720) grad_norm 3.4912 (2.4292) loss_scale 1024.0000 (1021.7835) mem 22339MB +[2024-07-25 09:46:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][240/625] eta 0:03:45 lr 0.000401 wd 0.0500 time 0.5756 (0.5858) data time 0.0007 (0.0023) model time 0.5749 (0.5852) loss 7.6713 (7.3692) grad_norm 2.2585 (2.4279) loss_scale 1024.0000 (1021.8755) mem 22339MB +[2024-07-25 09:47:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][250/625] eta 0:03:39 lr 0.000401 wd 0.0500 time 0.5778 (0.5854) data time 0.0008 (0.0023) model time 0.5770 (0.5847) loss 7.3726 (7.3800) grad_norm 2.9383 (2.4399) loss_scale 1024.0000 (1021.9602) mem 22339MB +[2024-07-25 09:47:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][260/625] eta 0:03:33 lr 0.000401 wd 0.0500 time 0.5743 (0.5850) data time 0.0008 (0.0022) model time 0.5735 (0.5841) loss 6.2745 (7.3768) grad_norm 1.9505 (2.4435) loss_scale 1024.0000 (1022.0383) mem 22339MB +[2024-07-25 09:47:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][270/625] eta 0:03:27 lr 0.000401 wd 0.0500 time 0.5700 (0.5847) data time 0.0008 (0.0022) model time 0.5692 (0.5838) loss 8.2360 (7.3841) grad_norm 2.7356 (2.4391) loss_scale 1024.0000 (1022.1107) mem 22339MB +[2024-07-25 09:47:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][280/625] eta 0:03:21 lr 0.000401 wd 0.0500 time 0.5730 (0.5844) data time 0.0007 (0.0021) model time 0.5723 (0.5834) loss 7.7517 (7.3741) grad_norm 2.7536 (2.4431) loss_scale 1024.0000 (1022.1779) mem 22339MB +[2024-07-25 09:47:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][290/625] eta 0:03:15 lr 0.000401 wd 0.0500 time 0.5612 (0.5846) data time 0.0008 (0.0021) model time 0.5604 (0.5837) loss 7.2032 (7.3677) grad_norm 2.4449 (2.4518) loss_scale 1024.0000 (1022.2405) mem 22339MB +[2024-07-25 09:47:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][300/625] eta 0:03:09 lr 0.000401 wd 0.0500 time 0.5743 (0.5843) data time 0.0007 (0.0020) model time 0.5736 (0.5833) loss 8.8507 (7.3851) grad_norm 2.4708 (2.4483) loss_scale 1024.0000 (1022.2990) mem 22339MB +[2024-07-25 09:47:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][310/625] eta 0:03:03 lr 0.000400 wd 0.0500 time 0.5750 (0.5840) data time 0.0008 (0.0020) model time 0.5742 (0.5829) loss 7.5036 (7.3846) grad_norm 1.6504 (2.4357) loss_scale 1024.0000 (1022.3537) mem 22339MB +[2024-07-25 09:47:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][320/625] eta 0:02:58 lr 0.000400 wd 0.0500 time 0.5765 (0.5837) data time 0.0008 (0.0020) model time 0.5756 (0.5826) loss 7.7644 (7.3890) grad_norm 1.9079 (2.4307) loss_scale 1024.0000 (1022.4050) mem 22339MB +[2024-07-25 09:47:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][330/625] eta 0:02:52 lr 0.000400 wd 0.0500 time 0.5714 (0.5834) data time 0.0005 (0.0019) model time 0.5709 (0.5822) loss 7.4411 (7.3903) grad_norm 4.7469 (2.4932) loss_scale 1024.0000 (1022.4532) mem 22339MB +[2024-07-25 09:47:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][340/625] eta 0:02:46 lr 0.000400 wd 0.0500 time 0.5738 (0.5831) data time 0.0006 (0.0019) model time 0.5731 (0.5819) loss 5.8197 (7.3864) grad_norm 4.6121 (2.4932) loss_scale 1024.0000 (1022.4985) mem 22339MB +[2024-07-25 09:47:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][350/625] eta 0:02:40 lr 0.000400 wd 0.0500 time 0.5752 (0.5829) data time 0.0008 (0.0019) model time 0.5744 (0.5817) loss 7.4397 (7.3824) grad_norm 1.9590 (2.5305) loss_scale 1024.0000 (1022.5413) mem 22339MB +[2024-07-25 09:48:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][360/625] eta 0:02:34 lr 0.000400 wd 0.0500 time 0.5208 (0.5831) data time 0.0009 (0.0018) model time 0.5199 (0.5819) loss 7.9853 (7.3838) grad_norm 3.6751 (2.5282) loss_scale 1024.0000 (1022.5817) mem 22339MB +[2024-07-25 09:48:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][370/625] eta 0:02:28 lr 0.000400 wd 0.0500 time 0.5718 (0.5829) data time 0.0009 (0.0018) model time 0.5709 (0.5817) loss 8.8017 (7.3891) grad_norm 2.4840 (2.5146) loss_scale 1024.0000 (1022.6199) mem 22339MB +[2024-07-25 09:48:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][380/625] eta 0:02:22 lr 0.000400 wd 0.0500 time 0.5696 (0.5826) data time 0.0007 (0.0018) model time 0.5689 (0.5814) loss 7.7929 (7.3960) grad_norm 3.2248 (2.5131) loss_scale 1024.0000 (1022.6562) mem 22339MB +[2024-07-25 09:48:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][390/625] eta 0:02:17 lr 0.000400 wd 0.0500 time 0.5626 (0.5832) data time 0.0007 (0.0018) model time 0.5619 (0.5821) loss 5.6797 (7.4053) grad_norm 1.8766 (2.5059) loss_scale 1024.0000 (1022.6905) mem 22339MB +[2024-07-25 09:48:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][400/625] eta 0:02:11 lr 0.000400 wd 0.0500 time 0.5715 (0.5831) data time 0.0008 (0.0018) model time 0.5708 (0.5820) loss 7.3723 (7.4021) grad_norm 2.4836 (2.5028) loss_scale 1024.0000 (1022.7232) mem 22339MB +[2024-07-25 09:48:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][410/625] eta 0:02:05 lr 0.000399 wd 0.0500 time 0.7462 (0.5845) data time 0.0006 (0.0017) model time 0.7456 (0.5835) loss 7.7414 (7.4034) grad_norm 3.2527 (2.5111) loss_scale 1024.0000 (1022.7543) mem 22339MB +[2024-07-25 09:48:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][420/625] eta 0:02:00 lr 0.000399 wd 0.0500 time 0.6401 (0.5856) data time 0.0006 (0.0017) model time 0.6395 (0.5848) loss 7.6566 (7.4096) grad_norm 5.2893 (2.5199) loss_scale 1024.0000 (1022.7838) mem 22339MB +[2024-07-25 09:48:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][430/625] eta 0:01:54 lr 0.000399 wd 0.0500 time 0.5742 (0.5865) data time 0.0009 (0.0017) model time 0.5733 (0.5859) loss 8.2803 (7.4068) grad_norm 3.3040 (2.5274) loss_scale 1024.0000 (1022.8121) mem 22339MB +[2024-07-25 09:48:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][440/625] eta 0:01:48 lr 0.000399 wd 0.0500 time 0.6587 (0.5868) data time 0.0006 (0.0017) model time 0.6581 (0.5861) loss 5.9769 (7.4001) grad_norm 2.3060 (2.5292) loss_scale 1024.0000 (1022.8390) mem 22339MB +[2024-07-25 09:48:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][450/625] eta 0:01:42 lr 0.000399 wd 0.0500 time 0.5717 (0.5868) data time 0.0006 (0.0017) model time 0.5710 (0.5861) loss 7.5123 (7.4085) grad_norm 1.7586 (2.5300) loss_scale 1024.0000 (1022.8647) mem 22339MB +[2024-07-25 09:49:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][460/625] eta 0:01:36 lr 0.000399 wd 0.0500 time 0.5701 (0.5865) data time 0.0008 (0.0016) model time 0.5692 (0.5858) loss 7.2715 (7.4044) grad_norm 2.5160 (2.5271) loss_scale 1024.0000 (1022.8894) mem 22339MB +[2024-07-25 09:49:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][470/625] eta 0:01:30 lr 0.000399 wd 0.0500 time 0.5646 (0.5862) data time 0.0008 (0.0016) model time 0.5638 (0.5855) loss 8.2073 (7.4057) grad_norm 2.0500 (2.5184) loss_scale 1024.0000 (1022.9130) mem 22339MB +[2024-07-25 09:49:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][480/625] eta 0:01:24 lr 0.000399 wd 0.0500 time 0.5676 (0.5860) data time 0.0008 (0.0016) model time 0.5668 (0.5852) loss 8.4384 (7.4009) grad_norm 2.5186 (2.5116) loss_scale 1024.0000 (1022.9356) mem 22339MB +[2024-07-25 09:49:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][490/625] eta 0:01:19 lr 0.000399 wd 0.0500 time 0.5683 (0.5857) data time 0.0010 (0.0016) model time 0.5673 (0.5849) loss 7.3894 (7.3998) grad_norm 1.6303 (2.5040) loss_scale 1024.0000 (1022.9572) mem 22339MB +[2024-07-25 09:49:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][500/625] eta 0:01:13 lr 0.000399 wd 0.0500 time 0.5698 (0.5855) data time 0.0008 (0.0016) model time 0.5690 (0.5847) loss 8.3598 (7.3989) grad_norm 1.9416 (2.4938) loss_scale 1024.0000 (1022.9780) mem 22339MB +[2024-07-25 09:49:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][510/625] eta 0:01:07 lr 0.000398 wd 0.0500 time 0.5726 (0.5853) data time 0.0006 (0.0016) model time 0.5720 (0.5844) loss 8.3469 (7.4002) grad_norm 1.5403 (2.4906) loss_scale 1024.0000 (1022.9980) mem 22339MB +[2024-07-25 09:49:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][520/625] eta 0:01:01 lr 0.000398 wd 0.0500 time 0.5731 (0.5850) data time 0.0005 (0.0016) model time 0.5726 (0.5842) loss 8.1154 (7.3989) grad_norm 2.4295 (2.5153) loss_scale 1024.0000 (1023.0173) mem 22339MB +[2024-07-25 09:49:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][530/625] eta 0:00:55 lr 0.000398 wd 0.0500 time 0.5736 (0.5848) data time 0.0008 (0.0016) model time 0.5728 (0.5839) loss 6.3741 (7.4036) grad_norm 2.8906 (2.5162) loss_scale 1024.0000 (1023.0358) mem 22339MB +[2024-07-25 09:49:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][540/625] eta 0:00:49 lr 0.000398 wd 0.0500 time 0.5705 (0.5847) data time 0.0006 (0.0015) model time 0.5699 (0.5837) loss 9.0652 (7.4106) grad_norm 2.3014 (2.5156) loss_scale 1024.0000 (1023.0536) mem 22339MB +[2024-07-25 09:49:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][550/625] eta 0:00:43 lr 0.000398 wd 0.0500 time 0.5742 (0.5845) data time 0.0006 (0.0015) model time 0.5735 (0.5836) loss 7.3177 (7.4115) grad_norm 1.6406 (2.5087) loss_scale 1024.0000 (1023.0708) mem 22339MB +[2024-07-25 09:50:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][560/625] eta 0:00:37 lr 0.000398 wd 0.0500 time 0.5744 (0.5843) data time 0.0008 (0.0015) model time 0.5736 (0.5834) loss 7.9280 (7.4100) grad_norm 3.0049 (2.5025) loss_scale 1024.0000 (1023.0873) mem 22339MB +[2024-07-25 09:50:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][570/625] eta 0:00:32 lr 0.000398 wd 0.0500 time 0.5707 (0.5842) data time 0.0006 (0.0015) model time 0.5700 (0.5832) loss 6.3173 (7.4079) grad_norm 2.1560 (2.4998) loss_scale 1024.0000 (1023.1033) mem 22339MB +[2024-07-25 09:50:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][580/625] eta 0:00:26 lr 0.000398 wd 0.0500 time 0.7111 (0.5842) data time 0.0006 (0.0015) model time 0.7105 (0.5833) loss 6.3806 (7.4104) grad_norm 1.9323 (2.4926) loss_scale 1024.0000 (1023.1188) mem 22339MB +[2024-07-25 09:50:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][590/625] eta 0:00:20 lr 0.000398 wd 0.0500 time 0.5710 (0.5841) data time 0.0008 (0.0015) model time 0.5702 (0.5831) loss 6.8790 (7.4086) grad_norm 2.0451 (2.4846) loss_scale 1024.0000 (1023.1337) mem 22339MB +[2024-07-25 09:50:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][600/625] eta 0:00:14 lr 0.000398 wd 0.0500 time 0.5737 (0.5839) data time 0.0007 (0.0015) model time 0.5729 (0.5829) loss 6.0262 (7.4062) grad_norm 2.6252 (2.4784) loss_scale 1024.0000 (1023.1481) mem 22339MB +[2024-07-25 09:50:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][610/625] eta 0:00:08 lr 0.000397 wd 0.0500 time 0.5729 (0.5840) data time 0.0006 (0.0015) model time 0.5724 (0.5830) loss 6.4003 (7.4055) grad_norm 2.3490 (2.4775) loss_scale 1024.0000 (1023.1620) mem 22339MB +[2024-07-25 09:50:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [191/300][620/625] eta 0:00:02 lr 0.000397 wd 0.0500 time 0.7013 (0.5841) data time 0.0006 (0.0015) model time 0.7008 (0.5831) loss 5.7065 (7.4047) grad_norm 2.3296 (2.4834) loss_scale 1024.0000 (1023.1755) mem 22339MB +[2024-07-25 09:50:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 191 training takes 0:06:05 +[2024-07-25 09:50:39 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 09:50:41 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 09:50:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.500 (0.500) Loss 0.5215 (0.5215) Acc@1 90.186 (90.186) Acc@5 98.730 (98.730) Mem 22339MB +[2024-07-25 09:50:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.160) Loss 0.8052 (0.6454) Acc@1 80.859 (86.923) Acc@5 96.387 (97.838) Mem 22339MB +[2024-07-25 09:50:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.9033 (0.7392) Acc@1 78.369 (84.094) Acc@5 95.215 (96.919) Mem 22339MB +[2024-07-25 09:50:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.687 Acc@5 96.891 +[2024-07-25 09:50:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.7% +[2024-07-25 09:50:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 83.69% +[2024-07-25 09:50:44 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 09:50:46 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 09:50:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.495 (0.495) Loss 0.5015 (0.5015) Acc@1 90.332 (90.332) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-25 09:50:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.159) Loss 0.7520 (0.6217) Acc@1 83.398 (87.305) Acc@5 96.533 (97.963) Mem 22339MB +[2024-07-25 09:50:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.8750 (0.7160) Acc@1 78.760 (84.335) Acc@5 95.703 (97.061) Mem 22339MB +[2024-07-25 09:50:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.941 Acc@5 97.051 +[2024-07-25 09:50:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 83.9% +[2024-07-25 09:50:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.94% +[2024-07-25 09:50:49 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 09:50:51 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 09:50:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][0/625] eta 0:08:59 lr 0.000397 wd 0.0500 time 0.8633 (0.8633) data time 0.3457 (0.3457) model time 0.0000 (0.0000) loss 6.7301 (6.7301) grad_norm 3.0957 (3.0957) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:50:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][10/625] eta 0:06:36 lr 0.000397 wd 0.0500 time 0.5694 (0.6445) data time 0.0008 (0.0322) model time 0.0000 (0.0000) loss 6.4950 (7.6182) grad_norm 1.9221 (2.6859) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:51:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][20/625] eta 0:06:26 lr 0.000397 wd 0.0500 time 0.6851 (0.6392) data time 0.0009 (0.0173) model time 0.0000 (0.0000) loss 9.1879 (7.7204) grad_norm 3.1096 (2.5711) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:51:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][30/625] eta 0:06:18 lr 0.000397 wd 0.0500 time 0.5686 (0.6361) data time 0.0008 (0.0120) model time 0.0000 (0.0000) loss 8.2193 (7.6625) grad_norm 1.7863 (2.5709) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:51:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][40/625] eta 0:06:05 lr 0.000397 wd 0.0500 time 0.5710 (0.6247) data time 0.0006 (0.0093) model time 0.0000 (0.0000) loss 7.1680 (7.6876) grad_norm 1.9385 (2.6184) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:51:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][50/625] eta 0:05:54 lr 0.000397 wd 0.0500 time 0.5757 (0.6168) data time 0.0006 (0.0076) model time 0.0000 (0.0000) loss 8.3090 (7.6829) grad_norm 1.8278 (2.8089) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:51:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][60/625] eta 0:05:44 lr 0.000397 wd 0.0500 time 0.5630 (0.6097) data time 0.0006 (0.0065) model time 0.5624 (0.5725) loss 8.6655 (7.6234) grad_norm 3.9034 (2.8126) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:51:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][70/625] eta 0:05:35 lr 0.000397 wd 0.0500 time 0.5739 (0.6047) data time 0.0008 (0.0057) model time 0.5731 (0.5730) loss 6.7601 (7.5781) grad_norm 2.2357 (2.7307) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:51:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][80/625] eta 0:05:27 lr 0.000396 wd 0.0500 time 0.5712 (0.6008) data time 0.0008 (0.0052) model time 0.5705 (0.5727) loss 8.2317 (7.5660) grad_norm 2.2340 (2.6849) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:51:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][90/625] eta 0:05:20 lr 0.000396 wd 0.0500 time 0.5748 (0.5994) data time 0.0006 (0.0047) model time 0.5742 (0.5763) loss 7.9283 (7.5165) grad_norm 1.8740 (2.6660) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:51:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][100/625] eta 0:05:13 lr 0.000396 wd 0.0500 time 0.5720 (0.5968) data time 0.0008 (0.0043) model time 0.5712 (0.5755) loss 8.7480 (7.4914) grad_norm 2.6540 (2.6414) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:51:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][110/625] eta 0:05:06 lr 0.000396 wd 0.0500 time 0.5719 (0.5948) data time 0.0008 (0.0040) model time 0.5711 (0.5753) loss 8.2096 (7.4654) grad_norm 1.7840 (2.6188) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:52:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][120/625] eta 0:04:59 lr 0.000396 wd 0.0500 time 0.5645 (0.5933) data time 0.0006 (0.0037) model time 0.5638 (0.5752) loss 7.0505 (7.4681) grad_norm 2.2369 (2.5881) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:52:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][130/625] eta 0:04:52 lr 0.000396 wd 0.0500 time 0.5727 (0.5919) data time 0.0006 (0.0035) model time 0.5721 (0.5751) loss 7.5457 (7.4680) grad_norm 1.8906 (2.5687) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:52:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][140/625] eta 0:04:46 lr 0.000396 wd 0.0500 time 0.5715 (0.5908) data time 0.0008 (0.0033) model time 0.5707 (0.5752) loss 6.2548 (7.4637) grad_norm 2.2271 (2.5655) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:52:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][150/625] eta 0:04:40 lr 0.000396 wd 0.0500 time 0.5729 (0.5898) data time 0.0006 (0.0032) model time 0.5723 (0.5751) loss 9.0548 (7.4839) grad_norm 2.4824 (2.5677) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:52:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][160/625] eta 0:04:33 lr 0.000396 wd 0.0500 time 0.5699 (0.5888) data time 0.0008 (0.0030) model time 0.5691 (0.5749) loss 7.4491 (7.4536) grad_norm 2.2050 (2.5535) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:52:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][170/625] eta 0:04:27 lr 0.000396 wd 0.0500 time 0.5738 (0.5879) data time 0.0006 (0.0029) model time 0.5732 (0.5748) loss 6.7192 (7.4393) grad_norm 1.8099 (2.5225) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:52:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][180/625] eta 0:04:21 lr 0.000395 wd 0.0500 time 0.5708 (0.5872) data time 0.0008 (0.0028) model time 0.5701 (0.5747) loss 8.9788 (7.4514) grad_norm 2.0547 (2.4995) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:52:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][190/625] eta 0:04:15 lr 0.000395 wd 0.0500 time 0.5746 (0.5865) data time 0.0008 (0.0027) model time 0.5738 (0.5745) loss 7.0804 (7.4287) grad_norm 2.3407 (2.4797) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:52:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][200/625] eta 0:04:09 lr 0.000395 wd 0.0500 time 0.5717 (0.5861) data time 0.0006 (0.0026) model time 0.5711 (0.5747) loss 5.9750 (7.4023) grad_norm 2.2396 (2.4660) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:52:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][210/625] eta 0:04:03 lr 0.000395 wd 0.0500 time 0.5727 (0.5858) data time 0.0008 (0.0025) model time 0.5719 (0.5751) loss 6.7045 (7.4148) grad_norm 1.7326 (2.4425) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:53:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][220/625] eta 0:03:57 lr 0.000395 wd 0.0500 time 0.7073 (0.5871) data time 0.0007 (0.0024) model time 0.7065 (0.5773) loss 6.2047 (7.4110) grad_norm 2.2046 (2.4521) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:53:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][230/625] eta 0:03:52 lr 0.000395 wd 0.0500 time 0.5701 (0.5894) data time 0.0008 (0.0024) model time 0.5693 (0.5807) loss 7.7911 (7.4199) grad_norm 3.5752 (2.4573) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:53:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][240/625] eta 0:03:47 lr 0.000395 wd 0.0500 time 0.7044 (0.5920) data time 0.0008 (0.0023) model time 0.7035 (0.5845) loss 7.6823 (7.4162) grad_norm 5.0464 (2.5236) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:53:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][250/625] eta 0:03:42 lr 0.000395 wd 0.0500 time 0.5615 (0.5925) data time 0.0006 (0.0023) model time 0.5609 (0.5854) loss 6.9628 (7.4135) grad_norm 1.7370 (2.5156) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:53:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][260/625] eta 0:03:36 lr 0.000395 wd 0.0500 time 0.5733 (0.5923) data time 0.0006 (0.0022) model time 0.5727 (0.5855) loss 6.0538 (7.4212) grad_norm 2.3564 (2.5060) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:53:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][270/625] eta 0:03:30 lr 0.000395 wd 0.0500 time 0.5751 (0.5922) data time 0.0006 (0.0022) model time 0.5745 (0.5856) loss 7.1000 (7.4197) grad_norm 1.9404 (2.5094) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:53:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][280/625] eta 0:03:24 lr 0.000394 wd 0.0500 time 0.5675 (0.5915) data time 0.0006 (0.0021) model time 0.5669 (0.5851) loss 7.6906 (7.4119) grad_norm 1.9245 (2.4890) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:53:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][290/625] eta 0:03:17 lr 0.000394 wd 0.0500 time 0.5721 (0.5909) data time 0.0006 (0.0021) model time 0.5715 (0.5846) loss 8.0563 (7.3962) grad_norm 2.3556 (2.4918) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:53:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][300/625] eta 0:03:12 lr 0.000394 wd 0.0500 time 0.5216 (0.5910) data time 0.0009 (0.0020) model time 0.5207 (0.5849) loss 7.0238 (7.4012) grad_norm 2.0262 (2.5000) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:53:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][310/625] eta 0:03:06 lr 0.000394 wd 0.0500 time 0.5686 (0.5906) data time 0.0006 (0.0020) model time 0.5680 (0.5846) loss 7.5428 (7.4025) grad_norm 2.8560 (2.5005) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:54:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][320/625] eta 0:02:59 lr 0.000394 wd 0.0500 time 0.5631 (0.5901) data time 0.0008 (0.0020) model time 0.5623 (0.5842) loss 7.5499 (7.3942) grad_norm 1.9242 (2.4891) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:54:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][330/625] eta 0:02:54 lr 0.000394 wd 0.0500 time 0.5740 (0.5900) data time 0.0007 (0.0019) model time 0.5733 (0.5842) loss 5.4801 (7.3871) grad_norm 2.0569 (2.4871) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:54:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][340/625] eta 0:02:48 lr 0.000394 wd 0.0500 time 0.5711 (0.5896) data time 0.0006 (0.0019) model time 0.5705 (0.5839) loss 8.0746 (7.3922) grad_norm 1.9714 (2.4739) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:54:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][350/625] eta 0:02:42 lr 0.000394 wd 0.0500 time 0.5731 (0.5892) data time 0.0006 (0.0019) model time 0.5725 (0.5836) loss 7.6525 (7.3939) grad_norm 2.5544 (2.4736) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:54:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][360/625] eta 0:02:36 lr 0.000394 wd 0.0500 time 0.5645 (0.5890) data time 0.0009 (0.0019) model time 0.5637 (0.5835) loss 6.2322 (7.3987) grad_norm 2.9442 (2.4850) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:54:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][370/625] eta 0:02:30 lr 0.000394 wd 0.0500 time 0.5719 (0.5886) data time 0.0008 (0.0018) model time 0.5712 (0.5832) loss 6.3502 (7.3881) grad_norm 2.5389 (2.4864) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:54:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][380/625] eta 0:02:24 lr 0.000393 wd 0.0500 time 0.5732 (0.5885) data time 0.0008 (0.0018) model time 0.5724 (0.5832) loss 7.4033 (7.3812) grad_norm 3.2680 (2.5119) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:54:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][390/625] eta 0:02:18 lr 0.000393 wd 0.0500 time 0.5699 (0.5882) data time 0.0008 (0.0018) model time 0.5691 (0.5830) loss 6.7736 (7.3780) grad_norm 2.7634 (2.5154) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:54:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][400/625] eta 0:02:12 lr 0.000393 wd 0.0500 time 0.5731 (0.5880) data time 0.0007 (0.0018) model time 0.5724 (0.5829) loss 8.8405 (7.3814) grad_norm 2.9167 (2.5206) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:54:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][410/625] eta 0:02:06 lr 0.000393 wd 0.0500 time 0.5736 (0.5877) data time 0.0006 (0.0018) model time 0.5730 (0.5826) loss 7.6019 (7.3768) grad_norm 3.3800 (2.5266) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:54:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][420/625] eta 0:02:00 lr 0.000393 wd 0.0500 time 0.5746 (0.5878) data time 0.0006 (0.0017) model time 0.5739 (0.5828) loss 7.7639 (7.3758) grad_norm 3.2873 (2.5542) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:55:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][430/625] eta 0:01:54 lr 0.000393 wd 0.0500 time 0.5704 (0.5878) data time 0.0006 (0.0017) model time 0.5697 (0.5829) loss 7.7869 (7.3851) grad_norm 2.0037 (2.5525) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:55:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][440/625] eta 0:01:48 lr 0.000393 wd 0.0500 time 0.5706 (0.5881) data time 0.0006 (0.0017) model time 0.5700 (0.5835) loss 6.2703 (7.3808) grad_norm 2.3932 (2.5528) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:55:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][450/625] eta 0:01:43 lr 0.000393 wd 0.0500 time 0.7606 (0.5892) data time 0.0006 (0.0017) model time 0.7600 (0.5847) loss 6.4125 (7.3774) grad_norm 2.1177 (2.5529) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:55:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][460/625] eta 0:01:37 lr 0.000393 wd 0.0500 time 0.7628 (0.5897) data time 0.0006 (0.0017) model time 0.7622 (0.5854) loss 6.2197 (7.3719) grad_norm 1.9259 (2.5447) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:55:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][470/625] eta 0:01:31 lr 0.000393 wd 0.0500 time 0.5724 (0.5908) data time 0.0006 (0.0017) model time 0.5718 (0.5868) loss 6.6144 (7.3732) grad_norm 2.3713 (2.5513) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:55:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][480/625] eta 0:01:25 lr 0.000392 wd 0.0500 time 0.5720 (0.5909) data time 0.0008 (0.0016) model time 0.5712 (0.5869) loss 7.3838 (7.3762) grad_norm 2.1354 (2.5538) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:55:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][490/625] eta 0:01:19 lr 0.000392 wd 0.0500 time 0.5667 (0.5909) data time 0.0008 (0.0016) model time 0.5659 (0.5869) loss 7.2575 (7.3727) grad_norm 1.8519 (2.5438) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:55:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][500/625] eta 0:01:13 lr 0.000392 wd 0.0500 time 0.5697 (0.5906) data time 0.0006 (0.0016) model time 0.5691 (0.5867) loss 7.8525 (7.3761) grad_norm 2.1946 (2.5392) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:55:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][510/625] eta 0:01:07 lr 0.000392 wd 0.0500 time 0.5724 (0.5902) data time 0.0008 (0.0016) model time 0.5716 (0.5864) loss 7.5632 (7.3708) grad_norm 2.0811 (2.5303) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:55:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][520/625] eta 0:01:01 lr 0.000392 wd 0.0500 time 0.5745 (0.5899) data time 0.0009 (0.0016) model time 0.5737 (0.5861) loss 8.2696 (7.3701) grad_norm 2.1331 (2.5390) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:56:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][530/625] eta 0:00:56 lr 0.000392 wd 0.0500 time 0.5683 (0.5899) data time 0.0006 (0.0016) model time 0.5677 (0.5861) loss 7.3631 (7.3703) grad_norm 2.6371 (2.5440) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:56:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][540/625] eta 0:00:50 lr 0.000392 wd 0.0500 time 0.5671 (0.5896) data time 0.0009 (0.0016) model time 0.5663 (0.5859) loss 6.8151 (7.3706) grad_norm 2.4720 (2.5379) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:56:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][550/625] eta 0:00:44 lr 0.000392 wd 0.0500 time 0.5645 (0.5894) data time 0.0008 (0.0016) model time 0.5636 (0.5857) loss 8.2585 (7.3724) grad_norm 2.5892 (2.5312) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:56:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][560/625] eta 0:00:38 lr 0.000392 wd 0.0500 time 0.5707 (0.5892) data time 0.0008 (0.0016) model time 0.5700 (0.5854) loss 8.1244 (7.3735) grad_norm 1.9115 (2.5218) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:56:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][570/625] eta 0:00:32 lr 0.000392 wd 0.0500 time 0.5731 (0.5890) data time 0.0006 (0.0016) model time 0.5725 (0.5853) loss 6.4083 (7.3746) grad_norm 1.8126 (2.5147) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:56:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][580/625] eta 0:00:26 lr 0.000392 wd 0.0500 time 0.5632 (0.5890) data time 0.0008 (0.0016) model time 0.5624 (0.5853) loss 8.3989 (7.3738) grad_norm 2.3376 (2.5062) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:56:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][590/625] eta 0:00:20 lr 0.000391 wd 0.0500 time 0.5686 (0.5888) data time 0.0006 (0.0015) model time 0.5680 (0.5851) loss 7.3557 (7.3721) grad_norm 2.3495 (2.5046) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:56:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][600/625] eta 0:00:14 lr 0.000391 wd 0.0500 time 0.5719 (0.5885) data time 0.0006 (0.0015) model time 0.5712 (0.5849) loss 6.9379 (7.3734) grad_norm 2.0497 (2.4979) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:56:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][610/625] eta 0:00:08 lr 0.000391 wd 0.0500 time 0.5741 (0.5883) data time 0.0006 (0.0015) model time 0.5735 (0.5848) loss 8.1091 (7.3725) grad_norm 1.6288 (2.4928) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:56:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [192/300][620/625] eta 0:00:02 lr 0.000391 wd 0.0500 time 0.5626 (0.5882) data time 0.0006 (0.0015) model time 0.5620 (0.5847) loss 7.2319 (7.3681) grad_norm 2.1947 (2.4927) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:56:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 192 training takes 0:06:07 +[2024-07-25 09:56:59 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 09:57:00 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 09:57:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.495 (0.495) Loss 0.5171 (0.5171) Acc@1 89.795 (89.795) Acc@5 98.828 (98.828) Mem 22339MB +[2024-07-25 09:57:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.159) Loss 0.7915 (0.6359) Acc@1 82.422 (86.981) Acc@5 96.533 (97.918) Mem 22339MB +[2024-07-25 09:57:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.144) Loss 0.8965 (0.7331) Acc@1 78.369 (84.084) Acc@5 95.654 (96.935) Mem 22339MB +[2024-07-25 09:57:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.773 Acc@5 96.903 +[2024-07-25 09:57:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.8% +[2024-07-25 09:57:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 83.77% +[2024-07-25 09:57:04 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 09:57:05 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 09:57:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.488 (0.488) Loss 0.5010 (0.5010) Acc@1 90.332 (90.332) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-25 09:57:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.7510 (0.6214) Acc@1 83.350 (87.340) Acc@5 96.533 (97.958) Mem 22339MB +[2024-07-25 09:57:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.143) Loss 0.8740 (0.7155) Acc@1 78.613 (84.380) Acc@5 95.850 (97.056) Mem 22339MB +[2024-07-25 09:57:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.991 Acc@5 97.051 +[2024-07-25 09:57:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.0% +[2024-07-25 09:57:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 83.99% +[2024-07-25 09:57:09 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 09:57:10 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 09:57:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][0/625] eta 0:09:16 lr 0.000391 wd 0.0500 time 0.8911 (0.8911) data time 0.3738 (0.3738) model time 0.0000 (0.0000) loss 8.2584 (8.2584) grad_norm 2.8904 (2.8904) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:57:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][10/625] eta 0:06:10 lr 0.000391 wd 0.0500 time 0.5719 (0.6017) data time 0.0008 (0.0348) model time 0.0000 (0.0000) loss 9.0551 (7.6503) grad_norm 3.9422 (2.8040) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:57:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][20/625] eta 0:06:10 lr 0.000391 wd 0.0500 time 0.7069 (0.6118) data time 0.0008 (0.0194) model time 0.0000 (0.0000) loss 7.3355 (7.4447) grad_norm 2.4499 (2.7150) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:57:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][30/625] eta 0:06:00 lr 0.000391 wd 0.0500 time 0.7110 (0.6055) data time 0.0006 (0.0135) model time 0.0000 (0.0000) loss 7.8394 (7.3734) grad_norm 1.7876 (2.5588) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:57:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][40/625] eta 0:05:56 lr 0.000391 wd 0.0500 time 0.7556 (0.6087) data time 0.0008 (0.0104) model time 0.0000 (0.0000) loss 7.3954 (7.4853) grad_norm 1.7229 (2.4522) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:57:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][50/625] eta 0:05:51 lr 0.000391 wd 0.0500 time 0.7793 (0.6115) data time 0.0008 (0.0085) model time 0.0000 (0.0000) loss 7.5078 (7.4724) grad_norm 2.7371 (2.3910) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:57:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][60/625] eta 0:05:47 lr 0.000390 wd 0.0500 time 0.6589 (0.6153) data time 0.0006 (0.0073) model time 0.6583 (0.6338) loss 7.6186 (7.4052) grad_norm 2.0108 (2.4162) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:57:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][70/625] eta 0:05:43 lr 0.000390 wd 0.0500 time 0.5721 (0.6185) data time 0.0008 (0.0064) model time 0.5713 (0.6357) loss 7.6810 (7.3953) grad_norm 2.3105 (2.4664) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:58:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][80/625] eta 0:05:35 lr 0.000390 wd 0.0500 time 0.5756 (0.6153) data time 0.0006 (0.0057) model time 0.5750 (0.6209) loss 8.6021 (7.4629) grad_norm 1.9159 (2.4537) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:58:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][90/625] eta 0:05:26 lr 0.000390 wd 0.0500 time 0.5708 (0.6109) data time 0.0008 (0.0052) model time 0.5700 (0.6092) loss 7.3767 (7.4512) grad_norm 2.5348 (2.4298) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:58:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][100/625] eta 0:05:18 lr 0.000390 wd 0.0500 time 0.5715 (0.6073) data time 0.0007 (0.0048) model time 0.5707 (0.6021) loss 7.7436 (7.4483) grad_norm 2.3158 (2.3948) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:58:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][110/625] eta 0:05:11 lr 0.000390 wd 0.0500 time 0.5711 (0.6044) data time 0.0006 (0.0044) model time 0.5705 (0.5974) loss 8.5721 (7.4586) grad_norm 2.5760 (2.3933) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:58:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][120/625] eta 0:05:03 lr 0.000390 wd 0.0500 time 0.5695 (0.6018) data time 0.0007 (0.0041) model time 0.5689 (0.5939) loss 5.8059 (7.4192) grad_norm 1.6783 (2.3992) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:58:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][130/625] eta 0:04:56 lr 0.000390 wd 0.0500 time 0.5702 (0.5997) data time 0.0008 (0.0039) model time 0.5695 (0.5914) loss 6.2669 (7.3853) grad_norm 2.1766 (2.3864) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:58:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][140/625] eta 0:04:50 lr 0.000390 wd 0.0500 time 0.5753 (0.5980) data time 0.0009 (0.0037) model time 0.5744 (0.5894) loss 8.5845 (7.3737) grad_norm 1.9960 (2.3774) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:58:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][150/625] eta 0:04:43 lr 0.000390 wd 0.0500 time 0.5740 (0.5965) data time 0.0007 (0.0035) model time 0.5733 (0.5880) loss 5.8779 (7.3610) grad_norm 1.4958 (2.3528) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:58:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][160/625] eta 0:04:36 lr 0.000389 wd 0.0500 time 0.5732 (0.5952) data time 0.0007 (0.0033) model time 0.5725 (0.5868) loss 7.7521 (7.3327) grad_norm 3.0501 (2.3330) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:58:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][170/625] eta 0:04:30 lr 0.000389 wd 0.0500 time 0.5750 (0.5941) data time 0.0006 (0.0032) model time 0.5744 (0.5859) loss 6.9588 (7.3452) grad_norm 2.5953 (2.3248) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:58:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][180/625] eta 0:04:23 lr 0.000389 wd 0.0500 time 0.5711 (0.5930) data time 0.0006 (0.0031) model time 0.5705 (0.5849) loss 7.4070 (7.3404) grad_norm 2.5128 (2.3276) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:59:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][190/625] eta 0:04:17 lr 0.000389 wd 0.0500 time 0.5754 (0.5921) data time 0.0009 (0.0029) model time 0.5745 (0.5841) loss 7.0995 (7.3357) grad_norm 2.3736 (2.3336) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:59:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][200/625] eta 0:04:11 lr 0.000389 wd 0.0500 time 0.5714 (0.5912) data time 0.0006 (0.0028) model time 0.5708 (0.5835) loss 6.7500 (7.3436) grad_norm 2.4985 (2.3400) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:59:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][210/625] eta 0:04:05 lr 0.000389 wd 0.0500 time 0.5740 (0.5906) data time 0.0007 (0.0027) model time 0.5733 (0.5830) loss 7.4727 (7.3615) grad_norm 3.2113 (2.3509) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:59:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][220/625] eta 0:03:58 lr 0.000389 wd 0.0500 time 0.5640 (0.5900) data time 0.0006 (0.0027) model time 0.5634 (0.5826) loss 7.8675 (7.3590) grad_norm 2.5971 (2.3667) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:59:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][230/625] eta 0:03:52 lr 0.000389 wd 0.0500 time 0.5713 (0.5894) data time 0.0006 (0.0026) model time 0.5707 (0.5822) loss 7.3603 (7.3802) grad_norm 1.9214 (2.3670) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:59:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][240/625] eta 0:03:47 lr 0.000389 wd 0.0500 time 0.5707 (0.5905) data time 0.0006 (0.0025) model time 0.5700 (0.5839) loss 6.0340 (7.4013) grad_norm 2.3086 (2.4320) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:59:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][250/625] eta 0:03:41 lr 0.000389 wd 0.0500 time 0.6991 (0.5904) data time 0.0007 (0.0024) model time 0.6984 (0.5841) loss 7.5305 (7.4013) grad_norm 2.4576 (2.4373) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:59:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][260/625] eta 0:03:35 lr 0.000388 wd 0.0500 time 0.5714 (0.5912) data time 0.0007 (0.0024) model time 0.5707 (0.5853) loss 6.1868 (7.3961) grad_norm 2.1361 (2.4712) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:59:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][270/625] eta 0:03:30 lr 0.000388 wd 0.0500 time 0.7037 (0.5933) data time 0.0007 (0.0023) model time 0.7030 (0.5882) loss 6.1517 (7.3886) grad_norm 1.5954 (2.4538) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 09:59:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][280/625] eta 0:03:24 lr 0.000388 wd 0.0500 time 0.5718 (0.5939) data time 0.0009 (0.0023) model time 0.5709 (0.5891) loss 6.4413 (7.3800) grad_norm 2.3981 (2.4508) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:00:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][290/625] eta 0:03:19 lr 0.000388 wd 0.0500 time 0.5727 (0.5956) data time 0.0008 (0.0022) model time 0.5720 (0.5913) loss 7.0286 (7.3638) grad_norm 2.1891 (2.4514) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:00:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][300/625] eta 0:03:13 lr 0.000388 wd 0.0500 time 0.7201 (0.5957) data time 0.0006 (0.0022) model time 0.7195 (0.5915) loss 8.1998 (7.3774) grad_norm 1.7360 (2.4394) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:00:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][310/625] eta 0:03:07 lr 0.000388 wd 0.0500 time 0.5734 (0.5950) data time 0.0007 (0.0021) model time 0.5728 (0.5909) loss 7.5276 (7.3811) grad_norm 2.3518 (2.4368) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:00:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][320/625] eta 0:03:01 lr 0.000388 wd 0.0500 time 0.5703 (0.5944) data time 0.0006 (0.0021) model time 0.5697 (0.5902) loss 6.9446 (7.3750) grad_norm 2.0755 (2.4710) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:00:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][330/625] eta 0:02:55 lr 0.000388 wd 0.0500 time 0.5717 (0.5937) data time 0.0008 (0.0021) model time 0.5708 (0.5896) loss 9.5684 (7.3757) grad_norm 2.2785 (2.4715) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:00:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][340/625] eta 0:02:49 lr 0.000388 wd 0.0500 time 0.5695 (0.5932) data time 0.0006 (0.0020) model time 0.5689 (0.5891) loss 7.0942 (7.3761) grad_norm 3.9048 (2.5025) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:00:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][350/625] eta 0:02:42 lr 0.000388 wd 0.0500 time 0.5670 (0.5926) data time 0.0007 (0.0020) model time 0.5663 (0.5885) loss 7.5867 (7.3733) grad_norm 2.3352 (2.5179) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:00:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][360/625] eta 0:02:36 lr 0.000387 wd 0.0500 time 0.5692 (0.5922) data time 0.0009 (0.0020) model time 0.5683 (0.5881) loss 8.2095 (7.3691) grad_norm 3.5698 (2.5186) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:00:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][370/625] eta 0:02:30 lr 0.000387 wd 0.0500 time 0.5727 (0.5917) data time 0.0007 (0.0019) model time 0.5720 (0.5877) loss 7.3408 (7.3746) grad_norm 1.6042 (2.5122) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:00:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][380/625] eta 0:02:24 lr 0.000387 wd 0.0500 time 0.5746 (0.5913) data time 0.0008 (0.0019) model time 0.5738 (0.5873) loss 7.1970 (7.3611) grad_norm 2.6004 (2.5031) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:01:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][390/625] eta 0:02:18 lr 0.000387 wd 0.0500 time 0.5743 (0.5909) data time 0.0006 (0.0019) model time 0.5737 (0.5869) loss 7.9299 (7.3646) grad_norm 3.6567 (2.5156) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:01:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][400/625] eta 0:02:12 lr 0.000387 wd 0.0500 time 0.5640 (0.5904) data time 0.0006 (0.0019) model time 0.5634 (0.5865) loss 8.4889 (7.3604) grad_norm 3.4810 (2.5302) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:01:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][410/625] eta 0:02:06 lr 0.000387 wd 0.0500 time 0.5721 (0.5900) data time 0.0009 (0.0018) model time 0.5712 (0.5861) loss 7.4521 (7.3654) grad_norm 1.9457 (2.5307) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:01:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][420/625] eta 0:02:00 lr 0.000387 wd 0.0500 time 0.5744 (0.5896) data time 0.0007 (0.0018) model time 0.5736 (0.5857) loss 7.6200 (7.3607) grad_norm 2.8192 (2.5273) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:01:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][430/625] eta 0:01:54 lr 0.000387 wd 0.0500 time 0.5734 (0.5893) data time 0.0008 (0.0018) model time 0.5726 (0.5854) loss 7.4035 (7.3528) grad_norm 1.8454 (2.5285) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:01:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][440/625] eta 0:01:48 lr 0.000387 wd 0.0500 time 0.5734 (0.5890) data time 0.0006 (0.0018) model time 0.5727 (0.5851) loss 7.4461 (7.3531) grad_norm 2.0697 (2.5332) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:01:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][450/625] eta 0:01:43 lr 0.000387 wd 0.0500 time 0.5736 (0.5887) data time 0.0006 (0.0018) model time 0.5730 (0.5849) loss 7.0010 (7.3477) grad_norm 2.3018 (2.5304) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:01:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][460/625] eta 0:01:37 lr 0.000386 wd 0.0500 time 0.5728 (0.5888) data time 0.0008 (0.0017) model time 0.5720 (0.5851) loss 7.6598 (7.3473) grad_norm 1.8741 (2.5287) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:01:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][470/625] eta 0:01:31 lr 0.000386 wd 0.0500 time 0.5728 (0.5889) data time 0.0008 (0.0017) model time 0.5721 (0.5852) loss 5.9963 (7.3426) grad_norm 2.7516 (2.5383) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:01:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][480/625] eta 0:01:25 lr 0.000386 wd 0.0500 time 0.5710 (0.5892) data time 0.0006 (0.0017) model time 0.5704 (0.5857) loss 7.0433 (7.3403) grad_norm 2.5500 (2.5417) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:02:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][490/625] eta 0:01:19 lr 0.000386 wd 0.0500 time 0.7134 (0.5907) data time 0.0008 (0.0017) model time 0.7126 (0.5875) loss 7.7762 (7.3487) grad_norm 1.5819 (2.5389) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:02:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][500/625] eta 0:01:13 lr 0.000386 wd 0.0500 time 0.5754 (0.5912) data time 0.0008 (0.0017) model time 0.5746 (0.5880) loss 5.8417 (7.3496) grad_norm 2.7397 (2.5537) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:02:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][510/625] eta 0:01:08 lr 0.000386 wd 0.0500 time 0.5621 (0.5915) data time 0.0007 (0.0016) model time 0.5614 (0.5884) loss 8.3489 (7.3491) grad_norm 2.5401 (2.5611) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:02:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][520/625] eta 0:01:02 lr 0.000386 wd 0.0500 time 0.5705 (0.5915) data time 0.0006 (0.0016) model time 0.5698 (0.5885) loss 7.7309 (7.3521) grad_norm 4.5327 (2.5623) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:02:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][530/625] eta 0:00:56 lr 0.000386 wd 0.0500 time 0.5733 (0.5913) data time 0.0008 (0.0016) model time 0.5725 (0.5882) loss 7.1948 (7.3558) grad_norm 2.6259 (2.5554) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:02:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][540/625] eta 0:00:50 lr 0.000386 wd 0.0500 time 0.5746 (0.5910) data time 0.0008 (0.0016) model time 0.5738 (0.5879) loss 8.1530 (7.3546) grad_norm 2.1877 (2.5480) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:02:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][550/625] eta 0:00:44 lr 0.000386 wd 0.0500 time 0.5721 (0.5907) data time 0.0006 (0.0016) model time 0.5715 (0.5877) loss 7.0230 (7.3557) grad_norm 2.3753 (2.5383) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:02:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][560/625] eta 0:00:38 lr 0.000386 wd 0.0500 time 0.5728 (0.5904) data time 0.0006 (0.0016) model time 0.5722 (0.5874) loss 7.2160 (7.3626) grad_norm 2.4866 (2.5345) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:02:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][570/625] eta 0:00:32 lr 0.000385 wd 0.0500 time 0.5743 (0.5901) data time 0.0006 (0.0016) model time 0.5738 (0.5871) loss 6.4201 (7.3591) grad_norm 1.9309 (2.5354) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:02:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][580/625] eta 0:00:26 lr 0.000385 wd 0.0500 time 0.5716 (0.5898) data time 0.0006 (0.0016) model time 0.5710 (0.5868) loss 7.6635 (7.3654) grad_norm 1.9982 (2.5363) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:02:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][590/625] eta 0:00:20 lr 0.000385 wd 0.0500 time 0.5748 (0.5896) data time 0.0008 (0.0015) model time 0.5740 (0.5866) loss 7.0833 (7.3624) grad_norm 2.2150 (2.5271) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:03:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][600/625] eta 0:00:14 lr 0.000385 wd 0.0500 time 0.5751 (0.5893) data time 0.0006 (0.0015) model time 0.5745 (0.5864) loss 7.8591 (7.3646) grad_norm 1.8183 (2.5234) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:03:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][610/625] eta 0:00:08 lr 0.000385 wd 0.0500 time 0.5671 (0.5891) data time 0.0006 (0.0015) model time 0.5665 (0.5861) loss 7.7974 (7.3698) grad_norm 1.9239 (2.5215) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:03:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [193/300][620/625] eta 0:00:02 lr 0.000385 wd 0.0500 time 0.5628 (0.5888) data time 0.0004 (0.0015) model time 0.5623 (0.5859) loss 7.0345 (7.3717) grad_norm 2.5403 (2.5210) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:03:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 193 training takes 0:06:07 +[2024-07-25 10:03:18 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 10:03:20 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 10:03:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.493 (0.493) Loss 0.4995 (0.4995) Acc@1 90.039 (90.039) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 10:03:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.159) Loss 0.7720 (0.6214) Acc@1 82.520 (87.069) Acc@5 96.484 (97.892) Mem 22339MB +[2024-07-25 10:03:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.8931 (0.7220) Acc@1 77.930 (84.159) Acc@5 95.459 (96.959) Mem 22339MB +[2024-07-25 10:03:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.815 Acc@5 96.963 +[2024-07-25 10:03:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.8% +[2024-07-25 10:03:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 83.82% +[2024-07-25 10:03:23 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 10:03:25 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 10:03:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.488 (0.488) Loss 0.5010 (0.5010) Acc@1 90.234 (90.234) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-25 10:03:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.7510 (0.6213) Acc@1 83.301 (87.322) Acc@5 96.484 (97.967) Mem 22339MB +[2024-07-25 10:03:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.143) Loss 0.8740 (0.7153) Acc@1 78.662 (84.380) Acc@5 95.850 (97.077) Mem 22339MB +[2024-07-25 10:03:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.987 Acc@5 97.069 +[2024-07-25 10:03:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.0% +[2024-07-25 10:03:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][0/625] eta 0:14:00 lr 0.000385 wd 0.0500 time 1.3447 (1.3447) data time 0.5092 (0.5092) model time 0.0000 (0.0000) loss 7.4181 (7.4181) grad_norm 2.1935 (2.1935) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:03:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][10/625] eta 0:06:36 lr 0.000385 wd 0.0500 time 0.5745 (0.6440) data time 0.0006 (0.0471) model time 0.0000 (0.0000) loss 7.1042 (7.1855) grad_norm 2.3985 (2.8660) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:03:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][20/625] eta 0:06:09 lr 0.000385 wd 0.0500 time 0.5753 (0.6116) data time 0.0006 (0.0251) model time 0.0000 (0.0000) loss 7.4417 (7.2266) grad_norm 2.1013 (2.6026) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:03:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][30/625] eta 0:05:56 lr 0.000385 wd 0.0500 time 0.5721 (0.5993) data time 0.0008 (0.0173) model time 0.0000 (0.0000) loss 7.8607 (7.3726) grad_norm 3.0377 (2.5624) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:03:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][40/625] eta 0:05:47 lr 0.000384 wd 0.0500 time 0.5723 (0.5934) data time 0.0006 (0.0133) model time 0.0000 (0.0000) loss 7.0447 (7.2729) grad_norm 1.7579 (2.5468) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:03:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][50/625] eta 0:05:38 lr 0.000384 wd 0.0500 time 0.5738 (0.5894) data time 0.0008 (0.0108) model time 0.0000 (0.0000) loss 6.7367 (7.3330) grad_norm 3.5305 (2.4763) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:04:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][60/625] eta 0:05:32 lr 0.000384 wd 0.0500 time 0.5696 (0.5885) data time 0.0008 (0.0092) model time 0.5688 (0.5832) loss 8.9133 (7.3425) grad_norm 1.8546 (2.4125) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:04:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][70/625] eta 0:05:27 lr 0.000384 wd 0.0500 time 0.7460 (0.5903) data time 0.0008 (0.0080) model time 0.7452 (0.5918) loss 6.3523 (7.3054) grad_norm 2.1737 (2.3835) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:04:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][80/625] eta 0:05:23 lr 0.000384 wd 0.0500 time 0.7388 (0.5937) data time 0.0007 (0.0071) model time 0.7381 (0.6003) loss 7.5651 (7.2712) grad_norm 1.7474 (2.3701) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:04:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][90/625] eta 0:05:18 lr 0.000384 wd 0.0500 time 0.5744 (0.5952) data time 0.0007 (0.0064) model time 0.5737 (0.6017) loss 7.7834 (7.2943) grad_norm 2.0568 (2.3408) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:04:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][100/625] eta 0:05:15 lr 0.000384 wd 0.0500 time 0.7580 (0.6007) data time 0.0006 (0.0059) model time 0.7574 (0.6114) loss 6.8853 (7.2609) grad_norm 2.9694 (2.3832) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:04:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][110/625] eta 0:05:09 lr 0.000384 wd 0.0500 time 0.6049 (0.6001) data time 0.0006 (0.0054) model time 0.6043 (0.6083) loss 7.0874 (7.2836) grad_norm 2.0033 (2.4524) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:04:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][120/625] eta 0:05:02 lr 0.000384 wd 0.0500 time 0.5775 (0.5987) data time 0.0008 (0.0051) model time 0.5768 (0.6046) loss 8.6596 (7.2997) grad_norm 2.2747 (2.4577) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:04:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][130/625] eta 0:04:55 lr 0.000384 wd 0.0500 time 0.5757 (0.5969) data time 0.0006 (0.0047) model time 0.5751 (0.6008) loss 8.6442 (7.2730) grad_norm 2.1789 (2.4593) loss_scale 2048.0000 (1063.0840) mem 22339MB +[2024-07-25 10:04:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][140/625] eta 0:04:48 lr 0.000383 wd 0.0500 time 0.5810 (0.5953) data time 0.0008 (0.0045) model time 0.5802 (0.5978) loss 9.0936 (7.2623) grad_norm 3.1283 (2.4578) loss_scale 2048.0000 (1132.9362) mem 22339MB +[2024-07-25 10:04:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][150/625] eta 0:04:42 lr 0.000383 wd 0.0500 time 0.5885 (0.5940) data time 0.0006 (0.0042) model time 0.5879 (0.5955) loss 7.6359 (7.2514) grad_norm 3.1855 (2.4415) loss_scale 2048.0000 (1193.5364) mem 22339MB +[2024-07-25 10:05:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][160/625] eta 0:04:35 lr 0.000383 wd 0.0500 time 0.5763 (0.5928) data time 0.0008 (0.0040) model time 0.5755 (0.5935) loss 7.0346 (7.2592) grad_norm 3.8163 (2.4505) loss_scale 2048.0000 (1246.6087) mem 22339MB +[2024-07-25 10:05:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][170/625] eta 0:04:29 lr 0.000383 wd 0.0500 time 0.5817 (0.5918) data time 0.0008 (0.0038) model time 0.5809 (0.5919) loss 8.1611 (7.2876) grad_norm 2.8819 (2.4930) loss_scale 2048.0000 (1293.4737) mem 22339MB +[2024-07-25 10:05:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][180/625] eta 0:04:22 lr 0.000383 wd 0.0500 time 0.5772 (0.5908) data time 0.0008 (0.0037) model time 0.5765 (0.5905) loss 7.5166 (7.2977) grad_norm 1.9117 (2.4841) loss_scale 2048.0000 (1335.1602) mem 22339MB +[2024-07-25 10:05:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][190/625] eta 0:04:16 lr 0.000383 wd 0.0500 time 0.5735 (0.5899) data time 0.0009 (0.0035) model time 0.5726 (0.5892) loss 8.5939 (7.3299) grad_norm 2.7487 (2.4576) loss_scale 2048.0000 (1372.4817) mem 22339MB +[2024-07-25 10:05:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][200/625] eta 0:04:10 lr 0.000383 wd 0.0500 time 0.5709 (0.5890) data time 0.0006 (0.0034) model time 0.5703 (0.5880) loss 7.8368 (7.3375) grad_norm 2.2422 (2.4497) loss_scale 2048.0000 (1406.0896) mem 22339MB +[2024-07-25 10:05:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][210/625] eta 0:04:04 lr 0.000383 wd 0.0500 time 0.5727 (0.5883) data time 0.0008 (0.0033) model time 0.5719 (0.5870) loss 8.6507 (7.3648) grad_norm 2.3641 (2.4380) loss_scale 2048.0000 (1436.5118) mem 22339MB +[2024-07-25 10:05:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][220/625] eta 0:03:57 lr 0.000383 wd 0.0500 time 0.5769 (0.5876) data time 0.0006 (0.0032) model time 0.5763 (0.5862) loss 7.5901 (7.3532) grad_norm 3.4568 (2.4371) loss_scale 2048.0000 (1464.1810) mem 22339MB +[2024-07-25 10:05:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][230/625] eta 0:03:52 lr 0.000383 wd 0.0500 time 0.5753 (0.5877) data time 0.0008 (0.0031) model time 0.5745 (0.5863) loss 8.3362 (7.3761) grad_norm 3.5643 (2.4397) loss_scale 2048.0000 (1489.4545) mem 22339MB +[2024-07-25 10:05:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][240/625] eta 0:03:46 lr 0.000382 wd 0.0500 time 0.5781 (0.5871) data time 0.0007 (0.0030) model time 0.5775 (0.5857) loss 6.4678 (7.3686) grad_norm 3.0400 (2.4661) loss_scale 2048.0000 (1512.6307) mem 22339MB +[2024-07-25 10:05:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][250/625] eta 0:03:40 lr 0.000382 wd 0.0500 time 0.5752 (0.5867) data time 0.0007 (0.0029) model time 0.5745 (0.5851) loss 8.7087 (7.3695) grad_norm 2.4812 (2.4894) loss_scale 2048.0000 (1533.9602) mem 22339MB +[2024-07-25 10:06:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][260/625] eta 0:03:33 lr 0.000382 wd 0.0500 time 0.5727 (0.5862) data time 0.0006 (0.0028) model time 0.5721 (0.5845) loss 8.0480 (7.3646) grad_norm 2.5823 (inf) loss_scale 1024.0000 (1518.3448) mem 22339MB +[2024-07-25 10:06:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][270/625] eta 0:03:27 lr 0.000382 wd 0.0500 time 0.5706 (0.5857) data time 0.0006 (0.0027) model time 0.5700 (0.5839) loss 8.2814 (7.3775) grad_norm 4.2704 (inf) loss_scale 1024.0000 (1500.1033) mem 22339MB +[2024-07-25 10:06:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][280/625] eta 0:03:22 lr 0.000382 wd 0.0500 time 0.5696 (0.5862) data time 0.0006 (0.0027) model time 0.5690 (0.5847) loss 6.8969 (7.3652) grad_norm 2.7380 (inf) loss_scale 1024.0000 (1483.1601) mem 22339MB +[2024-07-25 10:06:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][290/625] eta 0:03:16 lr 0.000382 wd 0.0500 time 0.5746 (0.5862) data time 0.0008 (0.0026) model time 0.5738 (0.5847) loss 8.2041 (7.3703) grad_norm 3.1919 (inf) loss_scale 1024.0000 (1467.3814) mem 22339MB +[2024-07-25 10:06:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][300/625] eta 0:03:10 lr 0.000382 wd 0.0500 time 0.7495 (0.5874) data time 0.0007 (0.0026) model time 0.7487 (0.5861) loss 5.7829 (7.3590) grad_norm 2.1454 (inf) loss_scale 1024.0000 (1452.6512) mem 22339MB +[2024-07-25 10:06:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][310/625] eta 0:03:05 lr 0.000382 wd 0.0500 time 0.6288 (0.5882) data time 0.0007 (0.0025) model time 0.6281 (0.5871) loss 5.8108 (7.3469) grad_norm 2.3750 (inf) loss_scale 1024.0000 (1438.8682) mem 22339MB +[2024-07-25 10:06:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][320/625] eta 0:03:00 lr 0.000382 wd 0.0500 time 0.7178 (0.5902) data time 0.0008 (0.0025) model time 0.7170 (0.5895) loss 8.3180 (7.3572) grad_norm 2.2229 (inf) loss_scale 1024.0000 (1425.9439) mem 22339MB +[2024-07-25 10:06:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][330/625] eta 0:02:54 lr 0.000382 wd 0.0500 time 0.6869 (0.5909) data time 0.0008 (0.0024) model time 0.6861 (0.5903) loss 7.1019 (7.3549) grad_norm 2.7760 (inf) loss_scale 1024.0000 (1413.8006) mem 22339MB +[2024-07-25 10:06:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][340/625] eta 0:02:48 lr 0.000381 wd 0.0500 time 0.5795 (0.5910) data time 0.0007 (0.0024) model time 0.5788 (0.5904) loss 7.5846 (7.3454) grad_norm 3.1625 (inf) loss_scale 1024.0000 (1402.3695) mem 22339MB +[2024-07-25 10:06:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][350/625] eta 0:02:42 lr 0.000381 wd 0.0500 time 0.5751 (0.5906) data time 0.0006 (0.0023) model time 0.5745 (0.5899) loss 8.4398 (7.3478) grad_norm 2.8934 (inf) loss_scale 1024.0000 (1391.5897) mem 22339MB +[2024-07-25 10:07:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][360/625] eta 0:02:36 lr 0.000381 wd 0.0500 time 0.5727 (0.5902) data time 0.0008 (0.0023) model time 0.5719 (0.5894) loss 7.3460 (7.3467) grad_norm 2.7190 (inf) loss_scale 1024.0000 (1381.4072) mem 22339MB +[2024-07-25 10:07:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][370/625] eta 0:02:30 lr 0.000381 wd 0.0500 time 0.5772 (0.5898) data time 0.0008 (0.0022) model time 0.5765 (0.5890) loss 7.1772 (7.3358) grad_norm 3.6299 (inf) loss_scale 1024.0000 (1371.7736) mem 22339MB +[2024-07-25 10:07:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][380/625] eta 0:02:24 lr 0.000381 wd 0.0500 time 0.5746 (0.5894) data time 0.0006 (0.0022) model time 0.5740 (0.5885) loss 8.1584 (7.3431) grad_norm 3.3278 (inf) loss_scale 1024.0000 (1362.6457) mem 22339MB +[2024-07-25 10:07:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][390/625] eta 0:02:18 lr 0.000381 wd 0.0500 time 0.5742 (0.5890) data time 0.0006 (0.0022) model time 0.5736 (0.5881) loss 8.3465 (7.3394) grad_norm 2.2453 (inf) loss_scale 1024.0000 (1353.9847) mem 22339MB +[2024-07-25 10:07:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][400/625] eta 0:02:12 lr 0.000381 wd 0.0500 time 0.5737 (0.5887) data time 0.0009 (0.0021) model time 0.5728 (0.5877) loss 7.4470 (7.3444) grad_norm 2.1094 (inf) loss_scale 1024.0000 (1345.7556) mem 22339MB +[2024-07-25 10:07:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][410/625] eta 0:02:06 lr 0.000381 wd 0.0500 time 0.5756 (0.5883) data time 0.0008 (0.0021) model time 0.5748 (0.5873) loss 8.3539 (7.3501) grad_norm 2.4307 (inf) loss_scale 1024.0000 (1337.9270) mem 22339MB +[2024-07-25 10:07:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][420/625] eta 0:02:00 lr 0.000381 wd 0.0500 time 0.5732 (0.5880) data time 0.0006 (0.0021) model time 0.5727 (0.5870) loss 6.3488 (7.3602) grad_norm 2.7869 (inf) loss_scale 1024.0000 (1330.4703) mem 22339MB +[2024-07-25 10:07:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][430/625] eta 0:01:54 lr 0.000381 wd 0.0500 time 0.5732 (0.5877) data time 0.0006 (0.0021) model time 0.5726 (0.5866) loss 8.1497 (7.3704) grad_norm 2.9008 (inf) loss_scale 1024.0000 (1323.3596) mem 22339MB +[2024-07-25 10:07:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][440/625] eta 0:01:48 lr 0.000381 wd 0.0500 time 0.5750 (0.5874) data time 0.0008 (0.0021) model time 0.5742 (0.5863) loss 6.7975 (7.3699) grad_norm 2.2769 (inf) loss_scale 1024.0000 (1316.5714) mem 22339MB +[2024-07-25 10:07:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][450/625] eta 0:01:42 lr 0.000380 wd 0.0500 time 0.6797 (0.5874) data time 0.0006 (0.0021) model time 0.6791 (0.5862) loss 8.9995 (7.3784) grad_norm 2.3918 (inf) loss_scale 1024.0000 (1310.0843) mem 22339MB +[2024-07-25 10:07:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][460/625] eta 0:01:36 lr 0.000380 wd 0.0500 time 0.6539 (0.5873) data time 0.0006 (0.0020) model time 0.6533 (0.5861) loss 8.4151 (7.3862) grad_norm 2.4459 (inf) loss_scale 1024.0000 (1303.8785) mem 22339MB +[2024-07-25 10:08:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][470/625] eta 0:01:30 lr 0.000380 wd 0.0500 time 0.5755 (0.5870) data time 0.0006 (0.0020) model time 0.5749 (0.5858) loss 6.6096 (7.3863) grad_norm 1.8692 (inf) loss_scale 1024.0000 (1297.9363) mem 22339MB +[2024-07-25 10:08:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][480/625] eta 0:01:25 lr 0.000380 wd 0.0500 time 0.5759 (0.5868) data time 0.0008 (0.0020) model time 0.5752 (0.5855) loss 6.8470 (7.3824) grad_norm 1.9329 (inf) loss_scale 1024.0000 (1292.2412) mem 22339MB +[2024-07-25 10:08:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][490/625] eta 0:01:19 lr 0.000380 wd 0.0500 time 0.5785 (0.5865) data time 0.0007 (0.0020) model time 0.5778 (0.5853) loss 7.0617 (7.3824) grad_norm 1.8487 (inf) loss_scale 1024.0000 (1286.7780) mem 22339MB +[2024-07-25 10:08:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][500/625] eta 0:01:13 lr 0.000380 wd 0.0500 time 0.5789 (0.5866) data time 0.0008 (0.0019) model time 0.5781 (0.5854) loss 8.0762 (7.3888) grad_norm 3.7862 (inf) loss_scale 1024.0000 (1281.5329) mem 22339MB +[2024-07-25 10:08:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][510/625] eta 0:01:07 lr 0.000380 wd 0.0500 time 0.7299 (0.5870) data time 0.0008 (0.0019) model time 0.7291 (0.5858) loss 8.5490 (7.3982) grad_norm 2.2086 (inf) loss_scale 1024.0000 (1276.4932) mem 22339MB +[2024-07-25 10:08:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][520/625] eta 0:01:01 lr 0.000380 wd 0.0500 time 0.7618 (0.5878) data time 0.0006 (0.0019) model time 0.7612 (0.5867) loss 6.6979 (7.4005) grad_norm 2.7910 (inf) loss_scale 1024.0000 (1271.6468) mem 22339MB +[2024-07-25 10:08:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][530/625] eta 0:00:55 lr 0.000380 wd 0.0500 time 0.7564 (0.5887) data time 0.0008 (0.0019) model time 0.7557 (0.5877) loss 7.0154 (7.3981) grad_norm 2.5230 (inf) loss_scale 1024.0000 (1266.9831) mem 22339MB +[2024-07-25 10:08:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][540/625] eta 0:00:50 lr 0.000380 wd 0.0500 time 0.5766 (0.5892) data time 0.0008 (0.0019) model time 0.5758 (0.5882) loss 7.7674 (7.4068) grad_norm 2.8785 (inf) loss_scale 1024.0000 (1262.4917) mem 22339MB +[2024-07-25 10:08:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][550/625] eta 0:00:44 lr 0.000379 wd 0.0500 time 0.5745 (0.5900) data time 0.0008 (0.0019) model time 0.5738 (0.5891) loss 8.1488 (7.4087) grad_norm 1.9541 (inf) loss_scale 1024.0000 (1258.1633) mem 22339MB +[2024-07-25 10:08:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][560/625] eta 0:00:38 lr 0.000379 wd 0.0500 time 0.5742 (0.5898) data time 0.0008 (0.0018) model time 0.5733 (0.5889) loss 7.5879 (7.4088) grad_norm 18.2811 (inf) loss_scale 1024.0000 (1253.9893) mem 22339MB +[2024-07-25 10:09:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][570/625] eta 0:00:32 lr 0.000379 wd 0.0500 time 0.5764 (0.5895) data time 0.0008 (0.0018) model time 0.5756 (0.5886) loss 6.9101 (7.4133) grad_norm 2.3255 (inf) loss_scale 1024.0000 (1249.9615) mem 22339MB +[2024-07-25 10:09:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][580/625] eta 0:00:26 lr 0.000379 wd 0.0500 time 0.5758 (0.5892) data time 0.0006 (0.0018) model time 0.5752 (0.5883) loss 7.4532 (7.4190) grad_norm 2.4552 (inf) loss_scale 1024.0000 (1246.0723) mem 22339MB +[2024-07-25 10:09:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][590/625] eta 0:00:20 lr 0.000379 wd 0.0500 time 0.5858 (0.5890) data time 0.0008 (0.0018) model time 0.5850 (0.5881) loss 5.7233 (7.4136) grad_norm 3.9040 (inf) loss_scale 1024.0000 (1242.3147) mem 22339MB +[2024-07-25 10:09:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][600/625] eta 0:00:14 lr 0.000379 wd 0.0500 time 0.5722 (0.5888) data time 0.0009 (0.0018) model time 0.5713 (0.5878) loss 8.6773 (7.4059) grad_norm 5.8119 (inf) loss_scale 1024.0000 (1238.6822) mem 22339MB +[2024-07-25 10:09:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][610/625] eta 0:00:08 lr 0.000379 wd 0.0500 time 0.5765 (0.5886) data time 0.0006 (0.0018) model time 0.5760 (0.5876) loss 7.6229 (7.4105) grad_norm 4.6745 (inf) loss_scale 1024.0000 (1235.1686) mem 22339MB +[2024-07-25 10:09:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [194/300][620/625] eta 0:00:02 lr 0.000379 wd 0.0500 time 0.5767 (0.5884) data time 0.0006 (0.0017) model time 0.5761 (0.5873) loss 7.8658 (7.4074) grad_norm 2.6718 (inf) loss_scale 1024.0000 (1231.7681) mem 22339MB +[2024-07-25 10:09:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 194 training takes 0:06:07 +[2024-07-25 10:09:36 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 10:09:37 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 10:09:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.481 (0.481) Loss 0.5137 (0.5137) Acc@1 89.551 (89.551) Acc@5 98.828 (98.828) Mem 22339MB +[2024-07-25 10:09:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.8018 (0.6284) Acc@1 82.031 (86.981) Acc@5 96.289 (97.900) Mem 22339MB +[2024-07-25 10:09:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.143) Loss 0.8770 (0.7255) Acc@1 78.809 (84.140) Acc@5 95.898 (96.959) Mem 22339MB +[2024-07-25 10:09:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.757 Acc@5 96.947 +[2024-07-25 10:09:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.8% +[2024-07-25 10:09:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.815 (0.815) Loss 0.5010 (0.5010) Acc@1 90.186 (90.186) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-25 10:09:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.190) Loss 0.7515 (0.6210) Acc@1 83.301 (87.327) Acc@5 96.387 (97.971) Mem 22339MB +[2024-07-25 10:09:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.159) Loss 0.8730 (0.7149) Acc@1 78.662 (84.394) Acc@5 95.801 (97.084) Mem 22339MB +[2024-07-25 10:09:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.999 Acc@5 97.077 +[2024-07-25 10:09:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.0% +[2024-07-25 10:09:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.00% +[2024-07-25 10:09:45 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 10:09:46 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 10:09:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][0/625] eta 0:09:41 lr 0.000379 wd 0.0500 time 0.9305 (0.9305) data time 0.4094 (0.4094) model time 0.0000 (0.0000) loss 7.8979 (7.8979) grad_norm 2.2023 (2.2023) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:09:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][10/625] eta 0:06:13 lr 0.000379 wd 0.0500 time 0.5713 (0.6072) data time 0.0006 (0.0380) model time 0.0000 (0.0000) loss 7.5020 (7.5392) grad_norm 1.8018 (2.9404) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:09:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][20/625] eta 0:05:57 lr 0.000378 wd 0.0500 time 0.5716 (0.5911) data time 0.0006 (0.0203) model time 0.0000 (0.0000) loss 7.4869 (7.1263) grad_norm 6.6637 (3.0930) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:10:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][30/625] eta 0:05:48 lr 0.000378 wd 0.0500 time 0.5723 (0.5857) data time 0.0006 (0.0141) model time 0.0000 (0.0000) loss 7.7349 (7.0625) grad_norm 6.6776 (3.3097) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:10:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][40/625] eta 0:05:41 lr 0.000378 wd 0.0500 time 0.5698 (0.5833) data time 0.0008 (0.0108) model time 0.0000 (0.0000) loss 8.2737 (7.2265) grad_norm 1.9725 (3.2033) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:10:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][50/625] eta 0:05:34 lr 0.000378 wd 0.0500 time 0.5730 (0.5814) data time 0.0008 (0.0089) model time 0.0000 (0.0000) loss 6.9642 (7.2009) grad_norm 2.2260 (3.0641) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:10:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][60/625] eta 0:05:27 lr 0.000378 wd 0.0500 time 0.5745 (0.5804) data time 0.0006 (0.0076) model time 0.5739 (0.5739) loss 7.4873 (7.1970) grad_norm 2.0162 (2.8950) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:10:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][70/625] eta 0:05:21 lr 0.000378 wd 0.0500 time 0.5722 (0.5794) data time 0.0008 (0.0066) model time 0.5714 (0.5732) loss 8.3313 (7.2340) grad_norm 2.0960 (2.8119) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:10:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][80/625] eta 0:05:15 lr 0.000378 wd 0.0500 time 0.5659 (0.5787) data time 0.0008 (0.0059) model time 0.5650 (0.5732) loss 6.7591 (7.2940) grad_norm 1.7938 (2.7690) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:10:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][90/625] eta 0:05:10 lr 0.000378 wd 0.0500 time 0.7256 (0.5800) data time 0.0006 (0.0054) model time 0.7250 (0.5773) loss 7.2472 (7.2625) grad_norm 1.9200 (2.7645) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:10:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][100/625] eta 0:05:04 lr 0.000378 wd 0.0500 time 0.5741 (0.5795) data time 0.0008 (0.0049) model time 0.5733 (0.5765) loss 7.5710 (7.3149) grad_norm 3.8894 (2.7267) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:10:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][110/625] eta 0:04:58 lr 0.000378 wd 0.0500 time 0.5677 (0.5804) data time 0.0008 (0.0046) model time 0.5668 (0.5787) loss 7.5542 (7.2604) grad_norm 1.6626 (2.6934) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:10:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][120/625] eta 0:04:56 lr 0.000378 wd 0.0500 time 0.7156 (0.5874) data time 0.0006 (0.0043) model time 0.7149 (0.5908) loss 7.6729 (7.2492) grad_norm 2.8773 (2.6966) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:11:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][130/625] eta 0:04:52 lr 0.000377 wd 0.0500 time 0.7275 (0.5919) data time 0.0007 (0.0040) model time 0.7268 (0.5977) loss 8.2978 (7.2773) grad_norm 2.7492 (2.6735) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:11:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][140/625] eta 0:04:48 lr 0.000377 wd 0.0500 time 0.7601 (0.5948) data time 0.0007 (0.0038) model time 0.7594 (0.6015) loss 7.9539 (7.3044) grad_norm 2.5785 (2.6517) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:11:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][150/625] eta 0:04:42 lr 0.000377 wd 0.0500 time 0.6257 (0.5940) data time 0.0006 (0.0036) model time 0.6250 (0.5996) loss 5.9223 (7.2835) grad_norm 1.9504 (2.6153) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:11:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][160/625] eta 0:04:35 lr 0.000377 wd 0.0500 time 0.5680 (0.5928) data time 0.0006 (0.0034) model time 0.5674 (0.5971) loss 7.4873 (7.2811) grad_norm 1.9542 (2.5905) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:11:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][170/625] eta 0:04:29 lr 0.000377 wd 0.0500 time 0.5743 (0.5917) data time 0.0007 (0.0033) model time 0.5735 (0.5951) loss 8.3127 (7.3008) grad_norm 2.3132 (2.6224) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:11:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][180/625] eta 0:04:22 lr 0.000377 wd 0.0500 time 0.5734 (0.5907) data time 0.0008 (0.0031) model time 0.5726 (0.5935) loss 7.9434 (7.3208) grad_norm 2.3484 (2.5977) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:11:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][190/625] eta 0:04:16 lr 0.000377 wd 0.0500 time 0.5706 (0.5899) data time 0.0006 (0.0030) model time 0.5700 (0.5921) loss 7.6780 (7.3079) grad_norm 2.4394 (2.5831) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:11:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][200/625] eta 0:04:10 lr 0.000377 wd 0.0500 time 0.5753 (0.5892) data time 0.0006 (0.0029) model time 0.5747 (0.5909) loss 6.7125 (7.3005) grad_norm 2.3575 (2.5813) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:11:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][210/625] eta 0:04:04 lr 0.000377 wd 0.0500 time 0.5742 (0.5885) data time 0.0007 (0.0028) model time 0.5734 (0.5899) loss 6.9405 (7.2896) grad_norm 3.1432 (2.5700) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:11:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][220/625] eta 0:03:58 lr 0.000377 wd 0.0500 time 0.5700 (0.5887) data time 0.0009 (0.0027) model time 0.5691 (0.5900) loss 8.3293 (7.2943) grad_norm 2.3561 (2.6217) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:12:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][230/625] eta 0:03:52 lr 0.000376 wd 0.0500 time 0.5520 (0.5881) data time 0.0007 (0.0026) model time 0.5513 (0.5891) loss 6.5572 (7.3096) grad_norm 1.7526 (2.6329) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:12:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][240/625] eta 0:03:46 lr 0.000376 wd 0.0500 time 0.5685 (0.5875) data time 0.0007 (0.0026) model time 0.5678 (0.5882) loss 8.4866 (7.3144) grad_norm 2.1556 (2.6217) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:12:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][250/625] eta 0:03:40 lr 0.000376 wd 0.0500 time 0.5675 (0.5870) data time 0.0009 (0.0025) model time 0.5666 (0.5875) loss 7.3923 (7.3102) grad_norm 1.8607 (2.5982) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:12:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][260/625] eta 0:03:34 lr 0.000376 wd 0.0500 time 0.5753 (0.5865) data time 0.0008 (0.0025) model time 0.5745 (0.5869) loss 8.2635 (7.3203) grad_norm 1.9724 (2.6125) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:12:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][270/625] eta 0:03:28 lr 0.000376 wd 0.0500 time 0.5741 (0.5861) data time 0.0006 (0.0024) model time 0.5735 (0.5863) loss 7.1096 (7.3153) grad_norm 2.3004 (2.6024) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:12:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][280/625] eta 0:03:22 lr 0.000376 wd 0.0500 time 0.5752 (0.5858) data time 0.0007 (0.0023) model time 0.5745 (0.5859) loss 7.7258 (7.3175) grad_norm 1.8869 (2.6063) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:12:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][290/625] eta 0:03:16 lr 0.000376 wd 0.0500 time 0.5713 (0.5855) data time 0.0008 (0.0023) model time 0.5705 (0.5854) loss 6.1621 (7.3268) grad_norm 3.0058 (2.6118) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:12:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][300/625] eta 0:03:10 lr 0.000376 wd 0.0500 time 0.5729 (0.5852) data time 0.0006 (0.0022) model time 0.5723 (0.5850) loss 6.8178 (7.3305) grad_norm 2.5312 (2.6127) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:12:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][310/625] eta 0:03:04 lr 0.000376 wd 0.0500 time 0.5641 (0.5849) data time 0.0008 (0.0022) model time 0.5633 (0.5847) loss 7.4910 (7.3254) grad_norm 2.0939 (2.6144) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:12:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][320/625] eta 0:02:58 lr 0.000376 wd 0.0500 time 0.5713 (0.5849) data time 0.0008 (0.0022) model time 0.5705 (0.5847) loss 8.0032 (7.3428) grad_norm 1.7219 (2.6047) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:13:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][330/625] eta 0:02:52 lr 0.000375 wd 0.0500 time 0.7640 (0.5860) data time 0.0008 (0.0021) model time 0.7632 (0.5859) loss 7.2629 (7.3410) grad_norm 2.0647 (2.6023) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:13:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][340/625] eta 0:02:47 lr 0.000375 wd 0.0500 time 0.6994 (0.5870) data time 0.0007 (0.0021) model time 0.6988 (0.5871) loss 6.9682 (7.3322) grad_norm 2.0330 (2.5981) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:13:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][350/625] eta 0:02:41 lr 0.000375 wd 0.0500 time 0.7195 (0.5884) data time 0.0008 (0.0021) model time 0.7187 (0.5886) loss 8.2618 (7.3240) grad_norm 2.6416 (2.5939) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:13:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][360/625] eta 0:02:36 lr 0.000375 wd 0.0500 time 0.7043 (0.5893) data time 0.0008 (0.0020) model time 0.7035 (0.5897) loss 6.7951 (7.3160) grad_norm 1.9348 (2.6095) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:13:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][370/625] eta 0:02:30 lr 0.000375 wd 0.0500 time 0.5705 (0.5894) data time 0.0008 (0.0020) model time 0.5697 (0.5898) loss 7.4788 (7.3275) grad_norm 3.7581 (2.6073) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:13:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][380/625] eta 0:02:24 lr 0.000375 wd 0.0500 time 0.5695 (0.5892) data time 0.0006 (0.0020) model time 0.5689 (0.5894) loss 7.0583 (7.3280) grad_norm 2.1542 (2.6071) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:13:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][390/625] eta 0:02:18 lr 0.000375 wd 0.0500 time 0.5721 (0.5888) data time 0.0009 (0.0019) model time 0.5712 (0.5890) loss 8.3672 (7.3316) grad_norm 2.1916 (2.6083) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:13:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][400/625] eta 0:02:12 lr 0.000375 wd 0.0500 time 0.5739 (0.5884) data time 0.0007 (0.0019) model time 0.5733 (0.5885) loss 8.0186 (7.3374) grad_norm 2.6477 (2.6173) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:13:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][410/625] eta 0:02:06 lr 0.000375 wd 0.0500 time 0.5749 (0.5881) data time 0.0009 (0.0019) model time 0.5740 (0.5882) loss 8.0101 (7.3290) grad_norm 2.7936 (2.6100) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:13:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][420/625] eta 0:02:00 lr 0.000375 wd 0.0500 time 0.5711 (0.5878) data time 0.0006 (0.0019) model time 0.5704 (0.5877) loss 7.4678 (7.3430) grad_norm 3.2403 (2.6083) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:14:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][430/625] eta 0:01:54 lr 0.000374 wd 0.0500 time 0.5724 (0.5875) data time 0.0007 (0.0018) model time 0.5717 (0.5874) loss 6.5974 (7.3481) grad_norm 2.7263 (2.6071) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:14:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][440/625] eta 0:01:48 lr 0.000374 wd 0.0500 time 0.5680 (0.5875) data time 0.0006 (0.0018) model time 0.5674 (0.5874) loss 6.8453 (7.3453) grad_norm 2.3099 (2.6070) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:14:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][450/625] eta 0:01:42 lr 0.000374 wd 0.0500 time 0.5745 (0.5872) data time 0.0006 (0.0018) model time 0.5739 (0.5870) loss 9.0661 (7.3505) grad_norm 2.1024 (2.5938) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:14:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][460/625] eta 0:01:36 lr 0.000374 wd 0.0500 time 0.5735 (0.5869) data time 0.0008 (0.0018) model time 0.5727 (0.5867) loss 8.1309 (7.3567) grad_norm 2.1338 (2.5903) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:14:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][470/625] eta 0:01:30 lr 0.000374 wd 0.0500 time 0.5695 (0.5866) data time 0.0006 (0.0018) model time 0.5688 (0.5863) loss 7.1602 (7.3525) grad_norm 2.0287 (inf) loss_scale 512.0000 (1017.4777) mem 22339MB +[2024-07-25 10:14:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][480/625] eta 0:01:25 lr 0.000374 wd 0.0500 time 0.5723 (0.5864) data time 0.0008 (0.0017) model time 0.5715 (0.5861) loss 7.8511 (7.3578) grad_norm 1.8054 (inf) loss_scale 512.0000 (1006.9688) mem 22339MB +[2024-07-25 10:14:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][490/625] eta 0:01:19 lr 0.000374 wd 0.0500 time 0.5639 (0.5861) data time 0.0008 (0.0017) model time 0.5631 (0.5858) loss 7.2520 (7.3496) grad_norm 1.7275 (inf) loss_scale 512.0000 (996.8880) mem 22339MB +[2024-07-25 10:14:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][500/625] eta 0:01:13 lr 0.000374 wd 0.0500 time 0.5733 (0.5859) data time 0.0006 (0.0017) model time 0.5727 (0.5856) loss 6.7062 (7.3497) grad_norm 1.8092 (inf) loss_scale 512.0000 (987.2096) mem 22339MB +[2024-07-25 10:14:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][510/625] eta 0:01:07 lr 0.000374 wd 0.0500 time 0.5737 (0.5857) data time 0.0007 (0.0017) model time 0.5730 (0.5853) loss 6.7891 (7.3512) grad_norm 3.6565 (inf) loss_scale 512.0000 (977.9100) mem 22339MB +[2024-07-25 10:14:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][520/625] eta 0:01:01 lr 0.000374 wd 0.0500 time 0.5713 (0.5855) data time 0.0008 (0.0017) model time 0.5706 (0.5851) loss 8.3780 (7.3563) grad_norm 3.4895 (inf) loss_scale 512.0000 (968.9674) mem 22339MB +[2024-07-25 10:14:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][530/625] eta 0:00:55 lr 0.000373 wd 0.0500 time 0.5734 (0.5853) data time 0.0009 (0.0017) model time 0.5725 (0.5848) loss 8.1517 (7.3527) grad_norm 2.4030 (inf) loss_scale 512.0000 (960.3616) mem 22339MB +[2024-07-25 10:15:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][540/625] eta 0:00:49 lr 0.000373 wd 0.0500 time 0.5741 (0.5855) data time 0.0007 (0.0016) model time 0.5734 (0.5850) loss 7.0311 (7.3602) grad_norm 2.2077 (inf) loss_scale 512.0000 (952.0739) mem 22339MB +[2024-07-25 10:15:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][550/625] eta 0:00:43 lr 0.000373 wd 0.0500 time 0.7204 (0.5859) data time 0.0008 (0.0016) model time 0.7196 (0.5855) loss 8.2928 (7.3611) grad_norm 2.9724 (inf) loss_scale 512.0000 (944.0871) mem 22339MB +[2024-07-25 10:15:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][560/625] eta 0:00:38 lr 0.000373 wd 0.0500 time 0.7459 (0.5873) data time 0.0006 (0.0016) model time 0.7453 (0.5870) loss 8.0356 (7.3622) grad_norm 2.5751 (inf) loss_scale 512.0000 (936.3850) mem 22339MB +[2024-07-25 10:15:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][570/625] eta 0:00:32 lr 0.000373 wd 0.0500 time 0.6936 (0.5877) data time 0.0008 (0.0016) model time 0.6927 (0.5875) loss 7.5484 (7.3643) grad_norm 5.4608 (inf) loss_scale 512.0000 (928.9527) mem 22339MB +[2024-07-25 10:15:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][580/625] eta 0:00:26 lr 0.000373 wd 0.0500 time 0.7647 (0.5891) data time 0.0007 (0.0016) model time 0.7640 (0.5890) loss 6.2086 (7.3644) grad_norm 2.6314 (inf) loss_scale 512.0000 (921.7762) mem 22339MB +[2024-07-25 10:15:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][590/625] eta 0:00:20 lr 0.000373 wd 0.0500 time 0.5722 (0.5891) data time 0.0008 (0.0016) model time 0.5714 (0.5889) loss 6.5200 (7.3653) grad_norm 1.7695 (inf) loss_scale 512.0000 (914.8426) mem 22339MB +[2024-07-25 10:15:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][600/625] eta 0:00:14 lr 0.000373 wd 0.0500 time 0.5733 (0.5889) data time 0.0010 (0.0016) model time 0.5722 (0.5887) loss 8.2617 (7.3635) grad_norm 1.5878 (inf) loss_scale 512.0000 (908.1398) mem 22339MB +[2024-07-25 10:15:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][610/625] eta 0:00:08 lr 0.000373 wd 0.0500 time 0.5693 (0.5887) data time 0.0004 (0.0016) model time 0.5689 (0.5884) loss 7.9729 (7.3668) grad_norm 2.8167 (inf) loss_scale 512.0000 (901.6563) mem 22339MB +[2024-07-25 10:15:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [195/300][620/625] eta 0:00:02 lr 0.000373 wd 0.0500 time 0.5687 (0.5884) data time 0.0006 (0.0016) model time 0.5681 (0.5881) loss 8.1388 (7.3693) grad_norm 2.3514 (inf) loss_scale 512.0000 (895.3816) mem 22339MB +[2024-07-25 10:15:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 195 training takes 0:06:07 +[2024-07-25 10:15:54 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 10:15:56 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 10:15:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.489 (0.489) Loss 0.5098 (0.5098) Acc@1 90.381 (90.381) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-25 10:15:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.159) Loss 0.7998 (0.6322) Acc@1 81.689 (87.109) Acc@5 96.289 (97.843) Mem 22339MB +[2024-07-25 10:15:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.143) Loss 0.9023 (0.7286) Acc@1 78.076 (84.189) Acc@5 95.703 (96.989) Mem 22339MB +[2024-07-25 10:15:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.859 Acc@5 96.993 +[2024-07-25 10:15:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.9% +[2024-07-25 10:15:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 83.86% +[2024-07-25 10:15:59 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 10:16:01 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 10:16:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.475 (0.475) Loss 0.5015 (0.5015) Acc@1 90.186 (90.186) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-25 10:16:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7520 (0.6211) Acc@1 83.398 (87.362) Acc@5 96.436 (97.967) Mem 22339MB +[2024-07-25 10:16:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.8721 (0.7147) Acc@1 78.711 (84.438) Acc@5 95.898 (97.089) Mem 22339MB +[2024-07-25 10:16:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.033 Acc@5 97.081 +[2024-07-25 10:16:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.0% +[2024-07-25 10:16:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.03% +[2024-07-25 10:16:04 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 10:16:06 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 10:16:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][0/625] eta 0:09:24 lr 0.000373 wd 0.0500 time 0.9030 (0.9030) data time 0.3871 (0.3871) model time 0.0000 (0.0000) loss 7.6382 (7.6382) grad_norm 1.6679 (1.6679) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:16:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][10/625] eta 0:06:11 lr 0.000372 wd 0.0500 time 0.5621 (0.6037) data time 0.0006 (0.0359) model time 0.0000 (0.0000) loss 6.6691 (7.3169) grad_norm 2.3813 (2.1520) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:16:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][20/625] eta 0:05:56 lr 0.000372 wd 0.0500 time 0.5704 (0.5890) data time 0.0006 (0.0192) model time 0.0000 (0.0000) loss 6.4720 (7.2924) grad_norm 2.4183 (2.0220) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:16:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][30/625] eta 0:05:47 lr 0.000372 wd 0.0500 time 0.5666 (0.5839) data time 0.0007 (0.0133) model time 0.0000 (0.0000) loss 7.4714 (7.3854) grad_norm 2.4589 (2.0947) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:16:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][40/625] eta 0:05:40 lr 0.000372 wd 0.0500 time 0.5702 (0.5816) data time 0.0007 (0.0102) model time 0.0000 (0.0000) loss 8.6179 (7.3672) grad_norm 2.6641 (2.1485) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:16:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][50/625] eta 0:05:33 lr 0.000372 wd 0.0500 time 0.5722 (0.5802) data time 0.0007 (0.0084) model time 0.0000 (0.0000) loss 9.2107 (7.4158) grad_norm 1.7661 (2.2332) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:16:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][60/625] eta 0:05:27 lr 0.000372 wd 0.0500 time 0.5721 (0.5792) data time 0.0006 (0.0072) model time 0.5715 (0.5733) loss 6.4189 (7.3751) grad_norm 3.0850 (2.3747) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:16:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][70/625] eta 0:05:21 lr 0.000372 wd 0.0500 time 0.5725 (0.5786) data time 0.0006 (0.0063) model time 0.5719 (0.5736) loss 6.1628 (7.3542) grad_norm 2.4458 (2.3436) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:16:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][80/625] eta 0:05:15 lr 0.000372 wd 0.0500 time 0.5798 (0.5781) data time 0.0007 (0.0056) model time 0.5791 (0.5739) loss 8.3662 (7.3107) grad_norm 2.3069 (2.3071) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:16:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][90/625] eta 0:05:09 lr 0.000372 wd 0.0500 time 0.5740 (0.5778) data time 0.0006 (0.0051) model time 0.5734 (0.5739) loss 5.8677 (7.3119) grad_norm 1.6054 (2.3107) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:17:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][100/625] eta 0:05:03 lr 0.000372 wd 0.0500 time 0.5614 (0.5774) data time 0.0006 (0.0047) model time 0.5608 (0.5738) loss 7.7748 (7.2945) grad_norm 3.1486 (2.3154) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:17:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][110/625] eta 0:04:57 lr 0.000371 wd 0.0500 time 0.5758 (0.5773) data time 0.0006 (0.0043) model time 0.5752 (0.5740) loss 7.7765 (7.2858) grad_norm 2.0671 (2.3266) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:17:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][120/625] eta 0:04:51 lr 0.000371 wd 0.0500 time 0.5623 (0.5771) data time 0.0006 (0.0040) model time 0.5617 (0.5739) loss 7.1748 (7.3199) grad_norm 2.8002 (2.3897) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:17:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][130/625] eta 0:04:45 lr 0.000371 wd 0.0500 time 0.5606 (0.5774) data time 0.0008 (0.0038) model time 0.5597 (0.5749) loss 7.9297 (7.3050) grad_norm 2.2957 (2.3905) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:17:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][140/625] eta 0:04:40 lr 0.000371 wd 0.0500 time 0.5716 (0.5786) data time 0.0008 (0.0036) model time 0.5708 (0.5769) loss 7.1390 (7.3106) grad_norm 2.4888 (2.3836) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:17:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][150/625] eta 0:04:35 lr 0.000371 wd 0.0500 time 0.5675 (0.5811) data time 0.0008 (0.0034) model time 0.5667 (0.5806) loss 6.9250 (7.3164) grad_norm 2.1557 (2.3744) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:17:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][160/625] eta 0:04:33 lr 0.000371 wd 0.0500 time 0.7229 (0.5875) data time 0.0006 (0.0032) model time 0.7224 (0.5900) loss 8.2999 (7.3230) grad_norm 2.4960 (2.3655) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:17:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][170/625] eta 0:04:28 lr 0.000371 wd 0.0500 time 0.5573 (0.5898) data time 0.0008 (0.0031) model time 0.5565 (0.5931) loss 6.8368 (7.3194) grad_norm 2.0520 (2.3516) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:17:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][180/625] eta 0:04:22 lr 0.000371 wd 0.0500 time 0.5725 (0.5903) data time 0.0006 (0.0030) model time 0.5719 (0.5935) loss 8.0170 (7.3275) grad_norm 4.0036 (2.3751) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:17:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][190/625] eta 0:04:16 lr 0.000371 wd 0.0500 time 0.5754 (0.5906) data time 0.0008 (0.0029) model time 0.5746 (0.5935) loss 7.5859 (7.3234) grad_norm 2.1054 (2.3752) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:18:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][200/625] eta 0:04:10 lr 0.000371 wd 0.0500 time 0.5699 (0.5901) data time 0.0007 (0.0028) model time 0.5693 (0.5927) loss 5.8016 (7.2971) grad_norm 2.1883 (2.3682) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:18:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][210/625] eta 0:04:04 lr 0.000370 wd 0.0500 time 0.5726 (0.5895) data time 0.0006 (0.0027) model time 0.5720 (0.5916) loss 5.9397 (7.3039) grad_norm 2.3487 (2.3564) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:18:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][220/625] eta 0:03:58 lr 0.000370 wd 0.0500 time 0.5609 (0.5888) data time 0.0006 (0.0026) model time 0.5603 (0.5906) loss 6.4119 (7.2921) grad_norm 2.3658 (2.3582) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:18:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][230/625] eta 0:03:52 lr 0.000370 wd 0.0500 time 0.5729 (0.5884) data time 0.0007 (0.0025) model time 0.5722 (0.5899) loss 7.1527 (7.3036) grad_norm 3.3353 (2.3720) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:18:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][240/625] eta 0:03:46 lr 0.000370 wd 0.0500 time 0.5737 (0.5879) data time 0.0006 (0.0024) model time 0.5731 (0.5892) loss 6.7369 (7.3030) grad_norm 1.7963 (2.3782) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:18:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][250/625] eta 0:03:40 lr 0.000370 wd 0.0500 time 0.5728 (0.5874) data time 0.0007 (0.0024) model time 0.5721 (0.5884) loss 6.1883 (7.2984) grad_norm 1.9149 (2.3816) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:18:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][260/625] eta 0:03:34 lr 0.000370 wd 0.0500 time 0.5737 (0.5871) data time 0.0008 (0.0023) model time 0.5728 (0.5879) loss 8.0470 (7.2952) grad_norm 1.9488 (2.3683) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:18:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][270/625] eta 0:03:28 lr 0.000370 wd 0.0500 time 0.5745 (0.5866) data time 0.0006 (0.0023) model time 0.5739 (0.5872) loss 7.6835 (7.2940) grad_norm 1.7098 (2.3880) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:18:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][280/625] eta 0:03:22 lr 0.000370 wd 0.0500 time 0.5726 (0.5862) data time 0.0006 (0.0022) model time 0.5721 (0.5867) loss 7.8022 (7.3040) grad_norm 1.8579 (2.3837) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:18:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][290/625] eta 0:03:16 lr 0.000370 wd 0.0500 time 0.5638 (0.5858) data time 0.0006 (0.0022) model time 0.5632 (0.5862) loss 6.7721 (7.3129) grad_norm 2.4984 (2.3783) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:19:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][300/625] eta 0:03:10 lr 0.000370 wd 0.0500 time 0.5750 (0.5855) data time 0.0008 (0.0021) model time 0.5742 (0.5857) loss 6.0051 (7.2965) grad_norm 2.8964 (2.3750) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:19:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][310/625] eta 0:03:04 lr 0.000370 wd 0.0500 time 0.5727 (0.5851) data time 0.0008 (0.0021) model time 0.5720 (0.5852) loss 7.8250 (7.3066) grad_norm 3.1630 (2.4393) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:19:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][320/625] eta 0:02:58 lr 0.000369 wd 0.0500 time 0.5743 (0.5848) data time 0.0006 (0.0020) model time 0.5738 (0.5848) loss 7.2363 (7.2994) grad_norm 3.2725 (2.4491) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:19:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][330/625] eta 0:02:52 lr 0.000369 wd 0.0500 time 0.5739 (0.5845) data time 0.0006 (0.0020) model time 0.5734 (0.5844) loss 6.5196 (7.3069) grad_norm 1.8019 (2.4739) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:19:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][340/625] eta 0:02:46 lr 0.000369 wd 0.0500 time 0.5701 (0.5842) data time 0.0008 (0.0020) model time 0.5693 (0.5840) loss 7.8066 (7.3100) grad_norm 1.7345 (2.4635) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:19:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][350/625] eta 0:02:40 lr 0.000369 wd 0.0500 time 0.6434 (0.5843) data time 0.0008 (0.0019) model time 0.6426 (0.5841) loss 7.9562 (7.3090) grad_norm 2.1057 (2.4668) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:19:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][360/625] eta 0:02:34 lr 0.000369 wd 0.0500 time 0.5693 (0.5840) data time 0.0006 (0.0019) model time 0.5686 (0.5838) loss 7.4751 (7.2988) grad_norm 3.3500 (2.4660) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:19:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][370/625] eta 0:02:29 lr 0.000369 wd 0.0500 time 0.5741 (0.5846) data time 0.0008 (0.0019) model time 0.5734 (0.5845) loss 6.2436 (7.2901) grad_norm 2.8081 (2.4696) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:19:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][380/625] eta 0:02:23 lr 0.000369 wd 0.0500 time 0.5736 (0.5856) data time 0.0008 (0.0019) model time 0.5728 (0.5856) loss 8.2434 (7.2915) grad_norm 3.5714 (2.4674) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:19:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][390/625] eta 0:02:18 lr 0.000369 wd 0.0500 time 0.6949 (0.5879) data time 0.0007 (0.0018) model time 0.6942 (0.5882) loss 8.3710 (7.2897) grad_norm 2.9839 (2.4762) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:20:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][400/625] eta 0:02:12 lr 0.000369 wd 0.0500 time 0.5692 (0.5881) data time 0.0008 (0.0018) model time 0.5684 (0.5884) loss 7.8259 (7.2886) grad_norm 2.4930 (2.4791) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:20:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][410/625] eta 0:02:06 lr 0.000369 wd 0.0500 time 0.5712 (0.5882) data time 0.0007 (0.0018) model time 0.5706 (0.5885) loss 7.9960 (7.2969) grad_norm 1.8234 (2.4739) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:20:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][420/625] eta 0:02:00 lr 0.000368 wd 0.0500 time 0.5702 (0.5878) data time 0.0008 (0.0018) model time 0.5694 (0.5881) loss 7.8249 (7.2995) grad_norm 2.6306 (2.5049) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:20:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][430/625] eta 0:01:54 lr 0.000368 wd 0.0500 time 0.5764 (0.5875) data time 0.0008 (0.0017) model time 0.5756 (0.5877) loss 6.5864 (7.2976) grad_norm 1.6440 (2.4976) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:20:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][440/625] eta 0:01:48 lr 0.000368 wd 0.0500 time 0.5727 (0.5873) data time 0.0006 (0.0017) model time 0.5721 (0.5874) loss 6.0409 (7.3034) grad_norm 1.9553 (2.4913) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:20:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][450/625] eta 0:01:42 lr 0.000368 wd 0.0500 time 0.5720 (0.5870) data time 0.0008 (0.0017) model time 0.5712 (0.5870) loss 8.2248 (7.3065) grad_norm 1.8603 (2.4780) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:20:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][460/625] eta 0:01:36 lr 0.000368 wd 0.0500 time 0.5652 (0.5867) data time 0.0007 (0.0017) model time 0.5645 (0.5867) loss 6.0680 (7.3080) grad_norm 2.1455 (2.4678) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:20:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][470/625] eta 0:01:30 lr 0.000368 wd 0.0500 time 0.5756 (0.5865) data time 0.0006 (0.0017) model time 0.5750 (0.5864) loss 7.5059 (7.3172) grad_norm 1.6380 (2.4602) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:20:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][480/625] eta 0:01:25 lr 0.000368 wd 0.0500 time 0.5706 (0.5862) data time 0.0008 (0.0017) model time 0.5698 (0.5861) loss 6.7650 (7.3157) grad_norm 1.8083 (2.4498) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:20:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][490/625] eta 0:01:19 lr 0.000368 wd 0.0500 time 0.5706 (0.5860) data time 0.0006 (0.0016) model time 0.5700 (0.5858) loss 8.3621 (7.3145) grad_norm 2.7790 (2.4464) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:20:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][500/625] eta 0:01:13 lr 0.000368 wd 0.0500 time 0.5723 (0.5858) data time 0.0008 (0.0016) model time 0.5715 (0.5855) loss 7.4897 (7.3160) grad_norm 1.9242 (2.4442) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:21:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][510/625] eta 0:01:07 lr 0.000368 wd 0.0500 time 0.5716 (0.5855) data time 0.0008 (0.0016) model time 0.5708 (0.5853) loss 8.0389 (7.3108) grad_norm 1.7739 (2.4354) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:21:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][520/625] eta 0:01:01 lr 0.000367 wd 0.0500 time 0.5729 (0.5853) data time 0.0008 (0.0016) model time 0.5721 (0.5850) loss 7.9616 (7.3187) grad_norm 2.2655 (2.4289) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:21:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][530/625] eta 0:00:55 lr 0.000367 wd 0.0500 time 0.5731 (0.5851) data time 0.0007 (0.0016) model time 0.5724 (0.5848) loss 6.1739 (7.3246) grad_norm 1.6173 (2.4301) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:21:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][540/625] eta 0:00:49 lr 0.000367 wd 0.0500 time 0.5722 (0.5849) data time 0.0007 (0.0016) model time 0.5715 (0.5846) loss 7.8593 (7.3279) grad_norm 2.3187 (2.4308) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:21:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][550/625] eta 0:00:43 lr 0.000367 wd 0.0500 time 0.5753 (0.5848) data time 0.0007 (0.0016) model time 0.5746 (0.5844) loss 7.1288 (7.3322) grad_norm 1.4862 (2.4276) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:21:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][560/625] eta 0:00:37 lr 0.000367 wd 0.0500 time 0.5704 (0.5846) data time 0.0006 (0.0015) model time 0.5698 (0.5841) loss 8.7200 (7.3413) grad_norm 2.0405 (2.4229) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:21:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][570/625] eta 0:00:32 lr 0.000367 wd 0.0500 time 0.7309 (0.5847) data time 0.0006 (0.0015) model time 0.7303 (0.5842) loss 8.1335 (7.3410) grad_norm 2.9474 (2.4261) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:21:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][580/625] eta 0:00:26 lr 0.000367 wd 0.0500 time 0.7426 (0.5848) data time 0.0008 (0.0015) model time 0.7418 (0.5844) loss 8.5899 (7.3444) grad_norm 2.4253 (2.4245) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:21:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][590/625] eta 0:00:20 lr 0.000367 wd 0.0500 time 0.5738 (0.5851) data time 0.0008 (0.0015) model time 0.5730 (0.5847) loss 8.5536 (7.3537) grad_norm 2.8078 (2.4186) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:21:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][600/625] eta 0:00:14 lr 0.000367 wd 0.0500 time 0.7045 (0.5860) data time 0.0006 (0.0015) model time 0.7039 (0.5857) loss 8.3539 (7.3579) grad_norm 1.9158 (2.4120) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:22:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][610/625] eta 0:00:08 lr 0.000367 wd 0.0500 time 0.7057 (0.5873) data time 0.0006 (0.0015) model time 0.7051 (0.5871) loss 6.2495 (7.3566) grad_norm 3.1563 (2.4124) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:22:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [196/300][620/625] eta 0:00:02 lr 0.000366 wd 0.0500 time 0.5684 (0.5879) data time 0.0004 (0.0015) model time 0.5681 (0.5877) loss 6.2523 (7.3562) grad_norm 1.6546 (2.4057) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:22:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 196 training takes 0:06:07 +[2024-07-25 10:22:13 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 10:22:15 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 10:22:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.477 (0.477) Loss 0.5088 (0.5088) Acc@1 89.941 (89.941) Acc@5 98.828 (98.828) Mem 22339MB +[2024-07-25 10:22:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.159) Loss 0.7739 (0.6310) Acc@1 81.982 (87.047) Acc@5 96.875 (97.958) Mem 22339MB +[2024-07-25 10:22:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.8633 (0.7272) Acc@1 79.590 (84.145) Acc@5 95.947 (97.033) Mem 22339MB +[2024-07-25 10:22:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.715 Acc@5 97.009 +[2024-07-25 10:22:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.7% +[2024-07-25 10:22:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.876 (0.876) Loss 0.5024 (0.5024) Acc@1 90.234 (90.234) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-25 10:22:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.194) Loss 0.7520 (0.6213) Acc@1 83.350 (87.380) Acc@5 96.484 (97.985) Mem 22339MB +[2024-07-25 10:22:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.161) Loss 0.8716 (0.7146) Acc@1 78.760 (84.468) Acc@5 95.947 (97.091) Mem 22339MB +[2024-07-25 10:22:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.065 Acc@5 97.079 +[2024-07-25 10:22:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.1% +[2024-07-25 10:22:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.07% +[2024-07-25 10:22:22 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 10:22:23 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 10:22:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][0/625] eta 0:09:31 lr 0.000366 wd 0.0500 time 0.9151 (0.9151) data time 0.3945 (0.3945) model time 0.0000 (0.0000) loss 6.9239 (6.9239) grad_norm 1.8726 (1.8726) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:22:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][10/625] eta 0:06:12 lr 0.000366 wd 0.0500 time 0.5724 (0.6063) data time 0.0007 (0.0367) model time 0.0000 (0.0000) loss 8.0046 (7.2788) grad_norm 2.4066 (2.2952) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:22:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][20/625] eta 0:05:57 lr 0.000366 wd 0.0500 time 0.5694 (0.5915) data time 0.0008 (0.0197) model time 0.0000 (0.0000) loss 8.1514 (7.3564) grad_norm 1.8873 (2.2700) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:22:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][30/625] eta 0:05:48 lr 0.000366 wd 0.0500 time 0.5737 (0.5858) data time 0.0006 (0.0136) model time 0.0000 (0.0000) loss 5.9054 (7.1588) grad_norm 3.0525 (2.2770) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:22:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][40/625] eta 0:05:41 lr 0.000366 wd 0.0500 time 0.5646 (0.5830) data time 0.0006 (0.0105) model time 0.0000 (0.0000) loss 7.1711 (7.2236) grad_norm 2.0101 (2.4687) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:22:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][50/625] eta 0:05:34 lr 0.000366 wd 0.0500 time 0.5704 (0.5809) data time 0.0008 (0.0086) model time 0.0000 (0.0000) loss 6.6880 (7.2583) grad_norm 2.2696 (2.5482) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:22:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][60/625] eta 0:05:27 lr 0.000366 wd 0.0500 time 0.5581 (0.5797) data time 0.0006 (0.0073) model time 0.5575 (0.5730) loss 6.3785 (7.2507) grad_norm 1.8583 (2.5249) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:23:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][70/625] eta 0:05:21 lr 0.000366 wd 0.0500 time 0.5710 (0.5788) data time 0.0007 (0.0064) model time 0.5702 (0.5727) loss 7.8489 (7.2889) grad_norm 3.3600 (2.5424) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:23:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][80/625] eta 0:05:15 lr 0.000366 wd 0.0500 time 0.5738 (0.5783) data time 0.0007 (0.0057) model time 0.5730 (0.5731) loss 6.9933 (7.3113) grad_norm 2.3023 (2.5103) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:23:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][90/625] eta 0:05:09 lr 0.000366 wd 0.0500 time 0.5820 (0.5778) data time 0.0006 (0.0052) model time 0.5814 (0.5731) loss 7.9255 (7.3055) grad_norm 2.0784 (2.4913) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:23:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][100/625] eta 0:05:03 lr 0.000365 wd 0.0500 time 0.5718 (0.5775) data time 0.0006 (0.0048) model time 0.5712 (0.5731) loss 6.7743 (7.3080) grad_norm 2.3021 (2.4632) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:23:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][110/625] eta 0:04:57 lr 0.000365 wd 0.0500 time 0.5746 (0.5774) data time 0.0006 (0.0044) model time 0.5740 (0.5735) loss 7.5336 (7.3112) grad_norm 3.1340 (2.4840) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:23:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][120/625] eta 0:04:51 lr 0.000365 wd 0.0500 time 0.5725 (0.5770) data time 0.0006 (0.0041) model time 0.5719 (0.5733) loss 7.1652 (7.2857) grad_norm 2.5215 (2.4864) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:23:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][130/625] eta 0:04:46 lr 0.000365 wd 0.0500 time 0.5719 (0.5779) data time 0.0008 (0.0039) model time 0.5711 (0.5751) loss 7.0652 (7.2783) grad_norm 2.3974 (2.4839) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:23:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][140/625] eta 0:04:40 lr 0.000365 wd 0.0500 time 0.5714 (0.5776) data time 0.0008 (0.0037) model time 0.5706 (0.5749) loss 7.2950 (7.2821) grad_norm 2.8111 (2.4708) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:23:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][150/625] eta 0:04:34 lr 0.000365 wd 0.0500 time 0.5727 (0.5774) data time 0.0007 (0.0035) model time 0.5719 (0.5748) loss 7.7093 (7.2858) grad_norm 1.7590 (2.4540) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:23:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][160/625] eta 0:04:28 lr 0.000365 wd 0.0500 time 0.5717 (0.5773) data time 0.0006 (0.0033) model time 0.5711 (0.5748) loss 8.2167 (7.2875) grad_norm 2.6398 (2.4482) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:24:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][170/625] eta 0:04:23 lr 0.000365 wd 0.0500 time 0.5745 (0.5788) data time 0.0008 (0.0032) model time 0.5737 (0.5770) loss 8.6820 (7.3004) grad_norm 1.9481 (2.4332) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:24:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][180/625] eta 0:04:17 lr 0.000365 wd 0.0500 time 0.5694 (0.5796) data time 0.0008 (0.0030) model time 0.5686 (0.5783) loss 6.5933 (7.3033) grad_norm 2.2526 (2.4436) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:24:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][190/625] eta 0:04:13 lr 0.000365 wd 0.0500 time 0.5626 (0.5831) data time 0.0010 (0.0029) model time 0.5616 (0.5831) loss 7.4257 (7.3180) grad_norm 2.0790 (2.4346) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:24:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][200/625] eta 0:04:09 lr 0.000364 wd 0.0500 time 0.5689 (0.5870) data time 0.0007 (0.0028) model time 0.5682 (0.5883) loss 6.8181 (7.3274) grad_norm 1.5683 (2.4332) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:24:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][210/625] eta 0:04:03 lr 0.000364 wd 0.0500 time 0.5764 (0.5872) data time 0.0006 (0.0027) model time 0.5758 (0.5884) loss 6.4801 (7.3023) grad_norm 3.0655 (2.4620) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:24:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][220/625] eta 0:03:58 lr 0.000364 wd 0.0500 time 0.6120 (0.5889) data time 0.0006 (0.0026) model time 0.6114 (0.5904) loss 6.0642 (7.3116) grad_norm 2.3553 (2.4740) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:24:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][230/625] eta 0:03:52 lr 0.000364 wd 0.0500 time 0.5763 (0.5887) data time 0.0008 (0.0026) model time 0.5756 (0.5901) loss 6.0650 (7.3146) grad_norm 2.7481 (2.4854) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:24:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][240/625] eta 0:03:46 lr 0.000364 wd 0.0500 time 0.5716 (0.5881) data time 0.0008 (0.0025) model time 0.5709 (0.5892) loss 8.2921 (7.3154) grad_norm 2.4748 (2.5265) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:24:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][250/625] eta 0:03:40 lr 0.000364 wd 0.0500 time 0.5619 (0.5876) data time 0.0006 (0.0024) model time 0.5613 (0.5884) loss 6.3778 (7.3182) grad_norm 2.3239 (2.5252) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:24:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][260/625] eta 0:03:34 lr 0.000364 wd 0.0500 time 0.5725 (0.5871) data time 0.0006 (0.0024) model time 0.5719 (0.5877) loss 7.7119 (7.3097) grad_norm 1.8061 (2.5046) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:25:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][270/625] eta 0:03:28 lr 0.000364 wd 0.0500 time 0.5717 (0.5866) data time 0.0008 (0.0023) model time 0.5709 (0.5871) loss 7.8568 (7.3043) grad_norm 1.8238 (2.5018) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:25:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][280/625] eta 0:03:22 lr 0.000364 wd 0.0500 time 0.5690 (0.5862) data time 0.0006 (0.0022) model time 0.5684 (0.5865) loss 6.8120 (7.3070) grad_norm 2.0112 (2.5018) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:25:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][290/625] eta 0:03:16 lr 0.000364 wd 0.0500 time 0.5725 (0.5857) data time 0.0008 (0.0022) model time 0.5716 (0.5859) loss 8.3559 (7.3159) grad_norm 2.3651 (2.4919) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:25:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][300/625] eta 0:03:10 lr 0.000364 wd 0.0500 time 0.5713 (0.5854) data time 0.0008 (0.0022) model time 0.5705 (0.5854) loss 8.4489 (7.3299) grad_norm 2.0456 (2.4901) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:25:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][310/625] eta 0:03:04 lr 0.000363 wd 0.0500 time 0.5643 (0.5850) data time 0.0008 (0.0021) model time 0.5635 (0.5850) loss 7.1696 (7.3350) grad_norm 2.1233 (2.4775) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:25:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][320/625] eta 0:02:58 lr 0.000363 wd 0.0500 time 0.5711 (0.5847) data time 0.0006 (0.0021) model time 0.5705 (0.5845) loss 6.1805 (7.3155) grad_norm 2.2250 (2.4870) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:25:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][330/625] eta 0:02:52 lr 0.000363 wd 0.0500 time 0.5713 (0.5844) data time 0.0008 (0.0020) model time 0.5705 (0.5842) loss 6.9169 (7.3144) grad_norm 2.6435 (2.4900) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:25:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][340/625] eta 0:02:46 lr 0.000363 wd 0.0500 time 0.5737 (0.5840) data time 0.0006 (0.0020) model time 0.5731 (0.5838) loss 7.1530 (7.3204) grad_norm 1.9943 (2.4906) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:25:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][350/625] eta 0:02:40 lr 0.000363 wd 0.0500 time 0.5703 (0.5839) data time 0.0008 (0.0020) model time 0.5695 (0.5836) loss 6.8333 (7.3249) grad_norm 21.8479 (2.5790) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:25:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][360/625] eta 0:02:34 lr 0.000363 wd 0.0500 time 0.5710 (0.5836) data time 0.0006 (0.0020) model time 0.5704 (0.5832) loss 6.3779 (7.3259) grad_norm 2.1060 (2.5735) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:26:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][370/625] eta 0:02:28 lr 0.000363 wd 0.0500 time 0.5747 (0.5835) data time 0.0008 (0.0019) model time 0.5739 (0.5830) loss 6.1346 (7.3187) grad_norm 2.3367 (2.5738) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:26:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][380/625] eta 0:02:22 lr 0.000363 wd 0.0500 time 0.5741 (0.5832) data time 0.0006 (0.0019) model time 0.5735 (0.5828) loss 7.5610 (7.3193) grad_norm 2.9896 (2.5635) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:26:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][390/625] eta 0:02:17 lr 0.000363 wd 0.0500 time 0.5688 (0.5837) data time 0.0008 (0.0019) model time 0.5680 (0.5833) loss 7.3679 (7.3252) grad_norm 1.9530 (2.5693) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:26:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][400/625] eta 0:02:11 lr 0.000363 wd 0.0500 time 0.5733 (0.5842) data time 0.0007 (0.0018) model time 0.5726 (0.5838) loss 6.0844 (7.3276) grad_norm 1.5971 (2.5701) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:26:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][410/625] eta 0:02:05 lr 0.000362 wd 0.0500 time 0.7388 (0.5855) data time 0.0007 (0.0018) model time 0.7381 (0.5853) loss 8.0004 (7.3321) grad_norm 1.7547 (2.5725) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:26:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][420/625] eta 0:02:00 lr 0.000362 wd 0.0500 time 0.5761 (0.5864) data time 0.0008 (0.0018) model time 0.5753 (0.5863) loss 7.0179 (7.3333) grad_norm 2.7293 (2.5602) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:26:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][430/625] eta 0:01:54 lr 0.000362 wd 0.0500 time 0.5612 (0.5864) data time 0.0008 (0.0018) model time 0.5604 (0.5863) loss 8.7457 (7.3398) grad_norm 4.1599 (2.5664) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:26:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][440/625] eta 0:01:48 lr 0.000362 wd 0.0500 time 0.7136 (0.5875) data time 0.0008 (0.0017) model time 0.7128 (0.5875) loss 7.6206 (7.3322) grad_norm 2.4466 (2.5615) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:26:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][450/625] eta 0:01:42 lr 0.000362 wd 0.0500 time 0.5748 (0.5873) data time 0.0008 (0.0017) model time 0.5740 (0.5872) loss 7.5725 (7.3320) grad_norm 1.7698 (2.5547) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:26:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][460/625] eta 0:01:36 lr 0.000362 wd 0.0500 time 0.5724 (0.5870) data time 0.0008 (0.0017) model time 0.5716 (0.5869) loss 8.6448 (7.3280) grad_norm 1.8417 (2.5447) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:27:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][470/625] eta 0:01:30 lr 0.000362 wd 0.0500 time 0.5723 (0.5867) data time 0.0008 (0.0017) model time 0.5715 (0.5866) loss 7.7283 (7.3334) grad_norm 3.2383 (2.5477) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:27:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][480/625] eta 0:01:25 lr 0.000362 wd 0.0500 time 0.5725 (0.5865) data time 0.0008 (0.0017) model time 0.5716 (0.5863) loss 7.9455 (7.3282) grad_norm 2.1021 (2.5500) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:27:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][490/625] eta 0:01:19 lr 0.000362 wd 0.0500 time 0.5730 (0.5863) data time 0.0009 (0.0017) model time 0.5722 (0.5860) loss 7.1761 (7.3346) grad_norm 1.8267 (2.5506) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:27:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][500/625] eta 0:01:13 lr 0.000362 wd 0.0500 time 0.5761 (0.5860) data time 0.0006 (0.0016) model time 0.5755 (0.5857) loss 6.4221 (7.3268) grad_norm 3.6074 (2.5475) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:27:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][510/625] eta 0:01:07 lr 0.000361 wd 0.0500 time 0.5712 (0.5858) data time 0.0009 (0.0016) model time 0.5703 (0.5855) loss 7.7811 (7.3267) grad_norm 2.0941 (2.5440) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:27:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][520/625] eta 0:01:01 lr 0.000361 wd 0.0500 time 0.5726 (0.5856) data time 0.0008 (0.0016) model time 0.5718 (0.5852) loss 6.7961 (7.3270) grad_norm 2.3117 (2.5459) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:27:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][530/625] eta 0:00:55 lr 0.000361 wd 0.0500 time 0.5696 (0.5854) data time 0.0007 (0.0016) model time 0.5689 (0.5850) loss 7.7717 (7.3231) grad_norm 4.5048 (2.5531) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:27:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][540/625] eta 0:00:49 lr 0.000361 wd 0.0500 time 0.5737 (0.5852) data time 0.0008 (0.0016) model time 0.5729 (0.5848) loss 8.0303 (7.3207) grad_norm 1.8868 (2.5527) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:27:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][550/625] eta 0:00:43 lr 0.000361 wd 0.0500 time 0.5681 (0.5850) data time 0.0008 (0.0016) model time 0.5674 (0.5845) loss 7.3376 (7.3208) grad_norm 2.4063 (2.5550) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:27:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][560/625] eta 0:00:38 lr 0.000361 wd 0.0500 time 0.5725 (0.5848) data time 0.0006 (0.0016) model time 0.5719 (0.5843) loss 6.8748 (7.3224) grad_norm 3.2671 (2.5581) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:27:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][570/625] eta 0:00:32 lr 0.000361 wd 0.0500 time 0.5696 (0.5848) data time 0.0008 (0.0015) model time 0.5688 (0.5844) loss 6.2537 (7.3240) grad_norm 3.1847 (2.5679) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:28:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][580/625] eta 0:00:26 lr 0.000361 wd 0.0500 time 0.5744 (0.5846) data time 0.0006 (0.0015) model time 0.5737 (0.5842) loss 6.3648 (7.3181) grad_norm 2.5003 (2.5692) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:28:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][590/625] eta 0:00:20 lr 0.000361 wd 0.0500 time 0.5751 (0.5845) data time 0.0006 (0.0015) model time 0.5745 (0.5840) loss 7.8817 (7.3170) grad_norm 2.5141 (2.5650) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:28:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][600/625] eta 0:00:14 lr 0.000361 wd 0.0500 time 0.5746 (0.5843) data time 0.0006 (0.0015) model time 0.5740 (0.5838) loss 6.7957 (7.3157) grad_norm 2.0783 (2.5576) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:28:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][610/625] eta 0:00:08 lr 0.000360 wd 0.0500 time 0.5692 (0.5844) data time 0.0006 (0.0015) model time 0.5686 (0.5839) loss 8.5600 (7.3194) grad_norm 2.1651 (2.5512) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:28:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [197/300][620/625] eta 0:00:02 lr 0.000360 wd 0.0500 time 0.5708 (0.5844) data time 0.0004 (0.0015) model time 0.5704 (0.5838) loss 6.7958 (7.3167) grad_norm 2.3055 (2.5489) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:28:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 197 training takes 0:06:05 +[2024-07-25 10:28:29 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 10:28:30 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 10:28:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.483 (0.483) Loss 0.5171 (0.5171) Acc@1 89.355 (89.355) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 10:28:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.158) Loss 0.7642 (0.6246) Acc@1 82.715 (87.052) Acc@5 96.582 (97.945) Mem 22339MB +[2024-07-25 10:28:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.143) Loss 0.8789 (0.7226) Acc@1 79.102 (84.240) Acc@5 95.850 (97.024) Mem 22339MB +[2024-07-25 10:28:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.845 Acc@5 97.029 +[2024-07-25 10:28:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.8% +[2024-07-25 10:28:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.783 (0.783) Loss 0.5024 (0.5024) Acc@1 90.283 (90.283) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-25 10:28:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.185) Loss 0.7520 (0.6212) Acc@1 83.447 (87.389) Acc@5 96.582 (98.002) Mem 22339MB +[2024-07-25 10:28:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.157) Loss 0.8716 (0.7144) Acc@1 78.662 (84.456) Acc@5 95.947 (97.110) Mem 22339MB +[2024-07-25 10:28:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.061 Acc@5 97.099 +[2024-07-25 10:28:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.1% +[2024-07-25 10:28:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][0/625] eta 0:15:20 lr 0.000360 wd 0.0500 time 1.4732 (1.4732) data time 0.5287 (0.5287) model time 0.0000 (0.0000) loss 8.6034 (8.6034) grad_norm 2.4790 (2.4790) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:28:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][10/625] eta 0:07:31 lr 0.000360 wd 0.0500 time 0.7459 (0.7345) data time 0.0008 (0.0488) model time 0.0000 (0.0000) loss 7.7482 (7.3078) grad_norm 2.0621 (2.3240) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:28:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][20/625] eta 0:06:54 lr 0.000360 wd 0.0500 time 0.7460 (0.6855) data time 0.0007 (0.0260) model time 0.0000 (0.0000) loss 6.6363 (7.3159) grad_norm 1.8253 (2.5953) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:28:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][30/625] eta 0:06:34 lr 0.000360 wd 0.0500 time 0.7369 (0.6637) data time 0.0008 (0.0179) model time 0.0000 (0.0000) loss 8.8619 (7.2555) grad_norm 1.9512 (3.3721) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:29:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][40/625] eta 0:06:17 lr 0.000360 wd 0.0500 time 0.5733 (0.6445) data time 0.0009 (0.0137) model time 0.0000 (0.0000) loss 8.8585 (7.2347) grad_norm 1.6771 (3.1258) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:29:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][50/625] eta 0:06:02 lr 0.000360 wd 0.0500 time 0.5771 (0.6307) data time 0.0007 (0.0112) model time 0.0000 (0.0000) loss 6.1251 (7.1367) grad_norm 2.5857 (3.0086) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:29:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][60/625] eta 0:05:51 lr 0.000360 wd 0.0500 time 0.5715 (0.6215) data time 0.0006 (0.0095) model time 0.5710 (0.5733) loss 8.6969 (7.1436) grad_norm 2.2592 (2.8766) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:29:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][70/625] eta 0:05:41 lr 0.000360 wd 0.0500 time 0.5773 (0.6151) data time 0.0006 (0.0083) model time 0.5766 (0.5743) loss 7.8874 (7.2343) grad_norm 2.2286 (2.7855) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:29:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][80/625] eta 0:05:32 lr 0.000360 wd 0.0500 time 0.5779 (0.6101) data time 0.0008 (0.0074) model time 0.5771 (0.5741) loss 7.6315 (7.2369) grad_norm 1.9036 (2.7124) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:29:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][90/625] eta 0:05:24 lr 0.000359 wd 0.0500 time 0.5740 (0.6061) data time 0.0006 (0.0067) model time 0.5734 (0.5739) loss 8.1054 (7.2702) grad_norm 2.4846 (2.7167) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:29:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][100/625] eta 0:05:16 lr 0.000359 wd 0.0500 time 0.5726 (0.6029) data time 0.0006 (0.0061) model time 0.5720 (0.5738) loss 5.6867 (7.2517) grad_norm 1.9796 (2.6766) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:29:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][110/625] eta 0:05:09 lr 0.000359 wd 0.0500 time 0.5755 (0.6003) data time 0.0008 (0.0056) model time 0.5747 (0.5737) loss 8.2337 (7.2750) grad_norm 1.7143 (2.6481) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:29:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][120/625] eta 0:05:02 lr 0.000359 wd 0.0500 time 0.5817 (0.5983) data time 0.0008 (0.0052) model time 0.5809 (0.5738) loss 6.8054 (7.2900) grad_norm 1.4665 (2.6042) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:29:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][130/625] eta 0:04:55 lr 0.000359 wd 0.0500 time 0.5743 (0.5965) data time 0.0008 (0.0049) model time 0.5734 (0.5738) loss 8.7306 (7.2801) grad_norm 2.8109 (2.5821) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:30:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][140/625] eta 0:04:48 lr 0.000359 wd 0.0500 time 0.5767 (0.5953) data time 0.0006 (0.0046) model time 0.5761 (0.5744) loss 7.6708 (7.2963) grad_norm 2.3505 (2.5533) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:30:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][150/625] eta 0:04:42 lr 0.000359 wd 0.0500 time 0.5744 (0.5939) data time 0.0006 (0.0043) model time 0.5738 (0.5742) loss 6.5950 (7.2854) grad_norm 1.9065 (2.5352) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:30:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][160/625] eta 0:04:35 lr 0.000359 wd 0.0500 time 0.5750 (0.5927) data time 0.0006 (0.0041) model time 0.5744 (0.5742) loss 6.9886 (7.2866) grad_norm 3.8641 (2.5426) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:30:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][170/625] eta 0:04:29 lr 0.000359 wd 0.0500 time 0.5744 (0.5916) data time 0.0009 (0.0039) model time 0.5735 (0.5742) loss 6.0904 (7.3083) grad_norm 1.6587 (2.5651) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:30:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][180/625] eta 0:04:22 lr 0.000359 wd 0.0500 time 0.5737 (0.5907) data time 0.0008 (0.0038) model time 0.5729 (0.5742) loss 9.1108 (7.3187) grad_norm 1.5715 (2.5578) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:30:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][190/625] eta 0:04:16 lr 0.000359 wd 0.0500 time 0.5777 (0.5899) data time 0.0009 (0.0036) model time 0.5768 (0.5741) loss 8.3229 (7.3106) grad_norm 1.8433 (2.5394) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:30:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][200/625] eta 0:04:10 lr 0.000358 wd 0.0500 time 0.5745 (0.5893) data time 0.0006 (0.0035) model time 0.5739 (0.5744) loss 7.4799 (7.3150) grad_norm 2.1000 (2.5280) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:30:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][210/625] eta 0:04:04 lr 0.000358 wd 0.0500 time 0.5787 (0.5891) data time 0.0006 (0.0034) model time 0.5781 (0.5750) loss 7.5873 (7.3206) grad_norm 2.0747 (2.5165) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:30:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][220/625] eta 0:03:58 lr 0.000358 wd 0.0500 time 0.6133 (0.5892) data time 0.0008 (0.0032) model time 0.6125 (0.5759) loss 7.3290 (7.3048) grad_norm 2.0391 (2.5066) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:30:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][230/625] eta 0:03:53 lr 0.000358 wd 0.0500 time 0.5759 (0.5920) data time 0.0007 (0.0031) model time 0.5752 (0.5802) loss 7.4957 (7.3074) grad_norm 2.6661 (2.4819) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:31:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][240/625] eta 0:03:49 lr 0.000358 wd 0.0500 time 0.5767 (0.5949) data time 0.0006 (0.0031) model time 0.5761 (0.5844) loss 6.6574 (7.3038) grad_norm 2.3705 (2.4708) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:31:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][250/625] eta 0:03:42 lr 0.000358 wd 0.0500 time 0.5751 (0.5945) data time 0.0006 (0.0030) model time 0.5744 (0.5844) loss 8.4683 (7.3092) grad_norm 2.3548 (2.5089) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:31:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][260/625] eta 0:03:37 lr 0.000358 wd 0.0500 time 0.7014 (0.5957) data time 0.0008 (0.0029) model time 0.7006 (0.5863) loss 8.3889 (7.3220) grad_norm 2.4337 (2.5124) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:31:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][270/625] eta 0:03:31 lr 0.000358 wd 0.0500 time 0.5736 (0.5949) data time 0.0008 (0.0028) model time 0.5728 (0.5857) loss 7.0659 (7.3204) grad_norm 2.0181 (2.5052) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:31:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][280/625] eta 0:03:24 lr 0.000358 wd 0.0500 time 0.5749 (0.5942) data time 0.0008 (0.0027) model time 0.5741 (0.5852) loss 7.9042 (7.3085) grad_norm 5.4095 (2.5782) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:31:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][290/625] eta 0:03:18 lr 0.000358 wd 0.0500 time 0.5783 (0.5935) data time 0.0007 (0.0027) model time 0.5776 (0.5848) loss 7.2616 (7.3024) grad_norm 2.9794 (2.6004) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:31:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][300/625] eta 0:03:12 lr 0.000357 wd 0.0500 time 0.6054 (0.5930) data time 0.0006 (0.0026) model time 0.6049 (0.5844) loss 6.6224 (7.3151) grad_norm 1.8973 (2.5935) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:31:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][310/625] eta 0:03:06 lr 0.000357 wd 0.0500 time 0.5755 (0.5924) data time 0.0006 (0.0026) model time 0.5749 (0.5840) loss 7.6809 (7.3280) grad_norm 2.0444 (2.5818) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:31:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][320/625] eta 0:03:00 lr 0.000357 wd 0.0500 time 0.5774 (0.5918) data time 0.0006 (0.0025) model time 0.5768 (0.5836) loss 8.9306 (7.3295) grad_norm 1.9383 (2.5706) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:31:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][330/625] eta 0:02:54 lr 0.000357 wd 0.0500 time 0.5721 (0.5913) data time 0.0006 (0.0025) model time 0.5715 (0.5833) loss 5.7373 (7.3143) grad_norm 1.9633 (2.5617) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:31:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][340/625] eta 0:02:48 lr 0.000357 wd 0.0500 time 0.5780 (0.5908) data time 0.0007 (0.0024) model time 0.5773 (0.5829) loss 8.1154 (7.3092) grad_norm 2.9163 (2.5755) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:32:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][350/625] eta 0:02:42 lr 0.000357 wd 0.0500 time 0.5753 (0.5903) data time 0.0008 (0.0024) model time 0.5745 (0.5826) loss 8.3187 (7.3069) grad_norm 1.8166 (2.5718) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:32:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][360/625] eta 0:02:36 lr 0.000357 wd 0.0500 time 0.5766 (0.5901) data time 0.0007 (0.0023) model time 0.5758 (0.5825) loss 8.7446 (7.3015) grad_norm 3.7594 (2.5762) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:32:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][370/625] eta 0:02:30 lr 0.000357 wd 0.0500 time 0.5760 (0.5896) data time 0.0008 (0.0023) model time 0.5752 (0.5822) loss 7.5479 (7.3026) grad_norm 1.8006 (2.5633) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:32:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][380/625] eta 0:02:24 lr 0.000357 wd 0.0500 time 0.5806 (0.5892) data time 0.0008 (0.0022) model time 0.5798 (0.5819) loss 8.5500 (7.3140) grad_norm 2.1297 (2.5564) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:32:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][390/625] eta 0:02:18 lr 0.000357 wd 0.0500 time 0.5754 (0.5889) data time 0.0007 (0.0022) model time 0.5746 (0.5817) loss 7.2054 (7.3185) grad_norm 2.3789 (2.5460) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:32:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][400/625] eta 0:02:12 lr 0.000356 wd 0.0500 time 0.5792 (0.5886) data time 0.0006 (0.0022) model time 0.5786 (0.5815) loss 6.8174 (7.3254) grad_norm 9.4503 (2.5546) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:32:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][410/625] eta 0:02:06 lr 0.000356 wd 0.0500 time 0.5843 (0.5883) data time 0.0008 (0.0021) model time 0.5834 (0.5814) loss 7.6919 (7.3260) grad_norm 1.6208 (2.5509) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:32:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][420/625] eta 0:02:00 lr 0.000356 wd 0.0500 time 0.5748 (0.5883) data time 0.0006 (0.0021) model time 0.5742 (0.5815) loss 6.2226 (7.3217) grad_norm 3.2627 (2.5502) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:32:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][430/625] eta 0:01:54 lr 0.000356 wd 0.0500 time 0.5756 (0.5886) data time 0.0008 (0.0021) model time 0.5748 (0.5821) loss 7.7546 (7.3263) grad_norm 1.4834 (2.5569) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:32:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][440/625] eta 0:01:48 lr 0.000356 wd 0.0500 time 0.6748 (0.5886) data time 0.0006 (0.0021) model time 0.6742 (0.5822) loss 8.6323 (7.3253) grad_norm 2.5073 (2.5587) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:33:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][450/625] eta 0:01:43 lr 0.000356 wd 0.0500 time 0.7450 (0.5901) data time 0.0006 (0.0020) model time 0.7444 (0.5841) loss 6.0759 (7.3227) grad_norm 1.7886 (2.5552) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:33:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][460/625] eta 0:01:37 lr 0.000356 wd 0.0500 time 0.5682 (0.5910) data time 0.0006 (0.0020) model time 0.5676 (0.5852) loss 7.7066 (7.3283) grad_norm 2.0438 (2.5463) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:33:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][470/625] eta 0:01:31 lr 0.000356 wd 0.0500 time 0.7656 (0.5910) data time 0.0007 (0.0020) model time 0.7650 (0.5853) loss 7.9912 (7.3211) grad_norm 2.4611 (2.5445) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:33:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][480/625] eta 0:01:25 lr 0.000356 wd 0.0500 time 0.6840 (0.5913) data time 0.0008 (0.0020) model time 0.6832 (0.5857) loss 8.0060 (7.3180) grad_norm 2.0006 (2.5408) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:33:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][490/625] eta 0:01:19 lr 0.000356 wd 0.0500 time 0.5722 (0.5909) data time 0.0006 (0.0019) model time 0.5715 (0.5854) loss 6.5314 (7.3248) grad_norm 2.3370 (2.5522) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:33:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][500/625] eta 0:01:13 lr 0.000356 wd 0.0500 time 0.5775 (0.5906) data time 0.0006 (0.0019) model time 0.5769 (0.5852) loss 9.0646 (7.3257) grad_norm 1.9724 (2.5461) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:33:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][510/625] eta 0:01:07 lr 0.000355 wd 0.0500 time 0.5738 (0.5904) data time 0.0006 (0.0019) model time 0.5733 (0.5850) loss 8.0041 (7.3142) grad_norm 1.7301 (2.5391) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:33:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][520/625] eta 0:01:01 lr 0.000355 wd 0.0500 time 0.5745 (0.5901) data time 0.0008 (0.0019) model time 0.5737 (0.5848) loss 7.8012 (7.3175) grad_norm 2.1708 (2.5443) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:33:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][530/625] eta 0:00:56 lr 0.000355 wd 0.0500 time 0.5741 (0.5898) data time 0.0008 (0.0019) model time 0.5733 (0.5845) loss 8.4123 (7.3196) grad_norm 2.2376 (2.5453) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:33:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][540/625] eta 0:00:50 lr 0.000355 wd 0.0500 time 0.5768 (0.5895) data time 0.0008 (0.0018) model time 0.5760 (0.5843) loss 6.3161 (7.3151) grad_norm 2.7484 (2.5444) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:34:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][550/625] eta 0:00:44 lr 0.000355 wd 0.0500 time 0.5754 (0.5893) data time 0.0007 (0.0018) model time 0.5746 (0.5842) loss 8.3815 (7.3142) grad_norm 1.8162 (2.5461) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:34:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][560/625] eta 0:00:38 lr 0.000355 wd 0.0500 time 0.5742 (0.5890) data time 0.0006 (0.0018) model time 0.5736 (0.5840) loss 7.9507 (7.3107) grad_norm 2.7771 (2.5468) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:34:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][570/625] eta 0:00:32 lr 0.000355 wd 0.0500 time 0.5734 (0.5888) data time 0.0008 (0.0018) model time 0.5726 (0.5838) loss 7.8750 (7.3108) grad_norm 2.1408 (2.5407) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:34:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][580/625] eta 0:00:26 lr 0.000355 wd 0.0500 time 0.5756 (0.5887) data time 0.0008 (0.0018) model time 0.5748 (0.5838) loss 8.1825 (7.3073) grad_norm 2.3047 (2.5405) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:34:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][590/625] eta 0:00:20 lr 0.000355 wd 0.0500 time 0.5760 (0.5885) data time 0.0006 (0.0018) model time 0.5754 (0.5836) loss 7.9331 (7.3040) grad_norm 2.7490 (2.5429) loss_scale 1024.0000 (512.8663) mem 22339MB +[2024-07-25 10:34:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][600/625] eta 0:00:14 lr 0.000355 wd 0.0500 time 0.5744 (0.5883) data time 0.0006 (0.0017) model time 0.5737 (0.5835) loss 6.9905 (7.3115) grad_norm 2.2134 (2.5351) loss_scale 1024.0000 (521.3710) mem 22339MB +[2024-07-25 10:34:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][610/625] eta 0:00:08 lr 0.000354 wd 0.0500 time 0.5749 (0.5881) data time 0.0006 (0.0017) model time 0.5743 (0.5833) loss 7.9290 (7.3102) grad_norm 1.9405 (2.5361) loss_scale 1024.0000 (529.5974) mem 22339MB +[2024-07-25 10:34:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [198/300][620/625] eta 0:00:02 lr 0.000354 wd 0.0500 time 0.5744 (0.5879) data time 0.0005 (0.0017) model time 0.5739 (0.5832) loss 6.9818 (7.3058) grad_norm 2.0830 (2.5330) loss_scale 1024.0000 (537.5588) mem 22339MB +[2024-07-25 10:34:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 198 training takes 0:06:07 +[2024-07-25 10:34:45 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 10:34:46 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 10:34:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.475 (0.475) Loss 0.4995 (0.4995) Acc@1 90.186 (90.186) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-25 10:34:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7793 (0.6222) Acc@1 82.129 (87.140) Acc@5 96.680 (97.909) Mem 22339MB +[2024-07-25 10:34:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.127 (0.142) Loss 0.8760 (0.7191) Acc@1 78.662 (84.342) Acc@5 95.947 (97.017) Mem 22339MB +[2024-07-25 10:34:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.961 Acc@5 97.027 +[2024-07-25 10:34:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.0% +[2024-07-25 10:34:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 83.96% +[2024-07-25 10:34:50 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 10:34:51 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 10:34:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.487 (0.487) Loss 0.5029 (0.5029) Acc@1 90.283 (90.283) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-25 10:34:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.7524 (0.6214) Acc@1 83.447 (87.385) Acc@5 96.631 (98.011) Mem 22339MB +[2024-07-25 10:34:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.8716 (0.7144) Acc@1 78.662 (84.459) Acc@5 95.996 (97.119) Mem 22339MB +[2024-07-25 10:34:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.067 Acc@5 97.113 +[2024-07-25 10:34:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.1% +[2024-07-25 10:34:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.07% +[2024-07-25 10:34:55 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 10:34:56 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 10:34:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][0/625] eta 0:09:24 lr 0.000354 wd 0.0500 time 0.9036 (0.9036) data time 0.3859 (0.3859) model time 0.0000 (0.0000) loss 7.5622 (7.5622) grad_norm 2.0710 (2.0710) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:35:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][10/625] eta 0:06:10 lr 0.000354 wd 0.0500 time 0.5620 (0.6023) data time 0.0008 (0.0359) model time 0.0000 (0.0000) loss 7.7396 (7.9873) grad_norm 4.1148 (3.2634) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:35:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][20/625] eta 0:06:02 lr 0.000354 wd 0.0500 time 0.6969 (0.5998) data time 0.0006 (0.0192) model time 0.0000 (0.0000) loss 6.7287 (7.5119) grad_norm 1.7162 (2.8707) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:35:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][30/625] eta 0:05:54 lr 0.000354 wd 0.0500 time 0.5683 (0.5958) data time 0.0007 (0.0133) model time 0.0000 (0.0000) loss 6.8325 (7.5072) grad_norm 1.8004 (3.1017) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:35:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][40/625] eta 0:05:49 lr 0.000354 wd 0.0500 time 0.5745 (0.5975) data time 0.0008 (0.0103) model time 0.0000 (0.0000) loss 7.0967 (7.3749) grad_norm 2.2378 (2.9286) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:35:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][50/625] eta 0:05:46 lr 0.000354 wd 0.0500 time 0.7327 (0.6034) data time 0.0006 (0.0084) model time 0.0000 (0.0000) loss 7.9681 (7.4589) grad_norm 2.3016 (2.7884) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:35:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][60/625] eta 0:05:41 lr 0.000354 wd 0.0500 time 0.5714 (0.6041) data time 0.0009 (0.0072) model time 0.5705 (0.6066) loss 7.3772 (7.5259) grad_norm 1.7676 (2.6781) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:35:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][70/625] eta 0:05:35 lr 0.000354 wd 0.0500 time 0.5710 (0.6044) data time 0.0006 (0.0063) model time 0.5704 (0.6059) loss 8.2848 (7.5208) grad_norm 2.1966 (2.7235) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:35:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][80/625] eta 0:05:28 lr 0.000354 wd 0.0500 time 0.5712 (0.6026) data time 0.0008 (0.0056) model time 0.5704 (0.6004) loss 6.7321 (7.4886) grad_norm 2.2613 (2.6698) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:35:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][90/625] eta 0:05:20 lr 0.000353 wd 0.0500 time 0.5718 (0.5994) data time 0.0007 (0.0051) model time 0.5711 (0.5935) loss 6.4487 (7.4742) grad_norm 2.0434 (2.6266) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:35:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][100/625] eta 0:05:13 lr 0.000353 wd 0.0500 time 0.5666 (0.5967) data time 0.0008 (0.0047) model time 0.5657 (0.5891) loss 7.0195 (7.4463) grad_norm 2.9824 (2.6196) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:36:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][110/625] eta 0:05:06 lr 0.000353 wd 0.0500 time 0.5712 (0.5947) data time 0.0007 (0.0043) model time 0.5704 (0.5864) loss 7.1806 (7.4746) grad_norm 1.8932 (2.6088) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:36:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][120/625] eta 0:04:59 lr 0.000353 wd 0.0500 time 0.5658 (0.5930) data time 0.0006 (0.0041) model time 0.5652 (0.5845) loss 7.2546 (7.4698) grad_norm 3.2173 (2.6608) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:36:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][130/625] eta 0:04:52 lr 0.000353 wd 0.0500 time 0.5738 (0.5916) data time 0.0008 (0.0038) model time 0.5730 (0.5831) loss 7.8233 (7.4625) grad_norm 3.0670 (2.6359) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:36:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][140/625] eta 0:04:46 lr 0.000353 wd 0.0500 time 0.5653 (0.5906) data time 0.0007 (0.0036) model time 0.5646 (0.5824) loss 8.0572 (7.4461) grad_norm 2.9155 (2.6028) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:36:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][150/625] eta 0:04:40 lr 0.000353 wd 0.0500 time 0.5632 (0.5895) data time 0.0008 (0.0034) model time 0.5624 (0.5816) loss 6.8543 (7.4425) grad_norm 2.0357 (2.5646) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:36:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][160/625] eta 0:04:33 lr 0.000353 wd 0.0500 time 0.5740 (0.5886) data time 0.0007 (0.0033) model time 0.5733 (0.5809) loss 6.9935 (7.4270) grad_norm 2.1590 (2.5335) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:36:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][170/625] eta 0:04:27 lr 0.000353 wd 0.0500 time 0.5730 (0.5879) data time 0.0007 (0.0031) model time 0.5723 (0.5804) loss 7.8472 (7.4210) grad_norm 1.7132 (2.5184) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:36:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][180/625] eta 0:04:21 lr 0.000353 wd 0.0500 time 0.5738 (0.5872) data time 0.0006 (0.0030) model time 0.5732 (0.5799) loss 8.0941 (7.4329) grad_norm 3.8198 (2.5163) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:36:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][190/625] eta 0:04:15 lr 0.000352 wd 0.0500 time 0.5660 (0.5865) data time 0.0008 (0.0029) model time 0.5652 (0.5795) loss 6.8961 (7.4242) grad_norm 2.8991 (2.5357) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:36:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][200/625] eta 0:04:09 lr 0.000352 wd 0.0500 time 0.5722 (0.5860) data time 0.0007 (0.0028) model time 0.5714 (0.5792) loss 6.6226 (7.4291) grad_norm 3.8127 (2.6287) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:37:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][210/625] eta 0:04:02 lr 0.000352 wd 0.0500 time 0.5730 (0.5854) data time 0.0007 (0.0027) model time 0.5723 (0.5788) loss 6.8116 (7.4341) grad_norm 2.5239 (2.6210) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:37:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][220/625] eta 0:03:56 lr 0.000352 wd 0.0500 time 0.5750 (0.5850) data time 0.0006 (0.0026) model time 0.5744 (0.5786) loss 7.6658 (7.4381) grad_norm 1.8667 (2.6127) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:37:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][230/625] eta 0:03:50 lr 0.000352 wd 0.0500 time 0.5712 (0.5846) data time 0.0008 (0.0025) model time 0.5704 (0.5783) loss 7.1029 (7.4455) grad_norm 2.3631 (2.5999) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:37:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][240/625] eta 0:03:45 lr 0.000352 wd 0.0500 time 0.5734 (0.5853) data time 0.0008 (0.0025) model time 0.5726 (0.5795) loss 7.4325 (7.4655) grad_norm 1.9592 (2.5837) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:37:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][250/625] eta 0:03:39 lr 0.000352 wd 0.0500 time 0.5749 (0.5855) data time 0.0006 (0.0024) model time 0.5743 (0.5801) loss 7.4998 (7.4787) grad_norm 2.7397 (2.5720) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:37:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][260/625] eta 0:03:34 lr 0.000352 wd 0.0500 time 0.7323 (0.5869) data time 0.0008 (0.0024) model time 0.7314 (0.5820) loss 7.3269 (7.4778) grad_norm 2.0669 (2.5593) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:37:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][270/625] eta 0:03:28 lr 0.000352 wd 0.0500 time 0.7569 (0.5885) data time 0.0008 (0.0023) model time 0.7561 (0.5842) loss 8.5937 (7.4620) grad_norm 1.7268 (2.5396) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:37:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][280/625] eta 0:03:23 lr 0.000352 wd 0.0500 time 0.5757 (0.5890) data time 0.0007 (0.0023) model time 0.5750 (0.5849) loss 8.1480 (7.4767) grad_norm 3.8604 (2.5374) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:37:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][290/625] eta 0:03:17 lr 0.000351 wd 0.0500 time 0.5712 (0.5900) data time 0.0006 (0.0022) model time 0.5705 (0.5863) loss 5.9976 (7.4809) grad_norm 2.9207 (2.5469) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:37:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][300/625] eta 0:03:11 lr 0.000351 wd 0.0500 time 0.5708 (0.5903) data time 0.0006 (0.0022) model time 0.5702 (0.5868) loss 8.6226 (7.4720) grad_norm 1.8109 (2.5680) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:38:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][310/625] eta 0:03:05 lr 0.000351 wd 0.0500 time 0.5670 (0.5898) data time 0.0006 (0.0021) model time 0.5664 (0.5863) loss 7.2357 (7.4719) grad_norm 1.7136 (2.5538) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:38:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][320/625] eta 0:02:59 lr 0.000351 wd 0.0500 time 0.5728 (0.5894) data time 0.0007 (0.0021) model time 0.5721 (0.5858) loss 6.6372 (7.4789) grad_norm 2.7742 (2.5406) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:38:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][330/625] eta 0:02:53 lr 0.000351 wd 0.0500 time 0.5672 (0.5889) data time 0.0008 (0.0020) model time 0.5664 (0.5854) loss 8.8215 (7.4772) grad_norm 2.8701 (2.5315) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:38:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][340/625] eta 0:02:47 lr 0.000351 wd 0.0500 time 0.5733 (0.5885) data time 0.0008 (0.0020) model time 0.5725 (0.5850) loss 5.7636 (7.4788) grad_norm 1.8295 (2.5233) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:38:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][350/625] eta 0:02:41 lr 0.000351 wd 0.0500 time 0.5698 (0.5881) data time 0.0006 (0.0020) model time 0.5692 (0.5847) loss 7.8267 (7.4794) grad_norm 2.5485 (2.5178) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:38:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][360/625] eta 0:02:35 lr 0.000351 wd 0.0500 time 0.5735 (0.5878) data time 0.0008 (0.0020) model time 0.5727 (0.5843) loss 7.4489 (7.4660) grad_norm 2.6463 (2.5151) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:38:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][370/625] eta 0:02:29 lr 0.000351 wd 0.0500 time 0.5704 (0.5874) data time 0.0006 (0.0019) model time 0.5697 (0.5840) loss 7.0009 (7.4672) grad_norm 1.8369 (2.4993) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:38:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][380/625] eta 0:02:23 lr 0.000351 wd 0.0500 time 0.5701 (0.5872) data time 0.0008 (0.0019) model time 0.5693 (0.5838) loss 7.0136 (7.4488) grad_norm 1.9614 (2.4884) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:38:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][390/625] eta 0:02:17 lr 0.000351 wd 0.0500 time 0.5743 (0.5869) data time 0.0008 (0.0019) model time 0.5735 (0.5835) loss 7.7720 (7.4464) grad_norm 1.6000 (2.4810) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:38:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][400/625] eta 0:02:11 lr 0.000350 wd 0.0500 time 0.5609 (0.5867) data time 0.0006 (0.0018) model time 0.5603 (0.5833) loss 8.6016 (7.4439) grad_norm 1.7538 (2.4708) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:38:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][410/625] eta 0:02:06 lr 0.000350 wd 0.0500 time 0.5621 (0.5864) data time 0.0008 (0.0018) model time 0.5613 (0.5831) loss 7.6961 (7.4351) grad_norm 1.8965 (2.4614) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:39:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][420/625] eta 0:02:00 lr 0.000350 wd 0.0500 time 0.5708 (0.5861) data time 0.0006 (0.0018) model time 0.5702 (0.5829) loss 6.9549 (7.4344) grad_norm 6.8600 (2.4678) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:39:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][430/625] eta 0:01:54 lr 0.000350 wd 0.0500 time 0.5718 (0.5859) data time 0.0008 (0.0018) model time 0.5710 (0.5827) loss 8.6003 (7.4379) grad_norm 2.4122 (2.4764) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 10:39:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][440/625] eta 0:01:48 lr 0.000350 wd 0.0500 time 0.5669 (0.5856) data time 0.0008 (0.0018) model time 0.5661 (0.5824) loss 7.3638 (7.4457) grad_norm 2.2556 (inf) loss_scale 512.0000 (1012.3900) mem 22339MB +[2024-07-25 10:39:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][450/625] eta 0:01:42 lr 0.000350 wd 0.0500 time 0.5728 (0.5854) data time 0.0009 (0.0017) model time 0.5719 (0.5822) loss 7.1114 (7.4426) grad_norm 2.3657 (inf) loss_scale 512.0000 (1001.2949) mem 22339MB +[2024-07-25 10:39:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][460/625] eta 0:01:36 lr 0.000350 wd 0.0500 time 0.5713 (0.5854) data time 0.0008 (0.0017) model time 0.5705 (0.5823) loss 6.0381 (7.4369) grad_norm 3.3454 (inf) loss_scale 512.0000 (990.6811) mem 22339MB +[2024-07-25 10:39:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][470/625] eta 0:01:30 lr 0.000350 wd 0.0500 time 0.5742 (0.5853) data time 0.0008 (0.0017) model time 0.5734 (0.5822) loss 7.8281 (7.4449) grad_norm 7.2675 (inf) loss_scale 512.0000 (980.5180) mem 22339MB +[2024-07-25 10:39:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][480/625] eta 0:01:25 lr 0.000350 wd 0.0500 time 0.5698 (0.5864) data time 0.0006 (0.0017) model time 0.5692 (0.5835) loss 5.8640 (7.4391) grad_norm 3.3765 (inf) loss_scale 512.0000 (970.7775) mem 22339MB +[2024-07-25 10:39:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][490/625] eta 0:01:19 lr 0.000350 wd 0.0500 time 0.7371 (0.5876) data time 0.0006 (0.0017) model time 0.7364 (0.5849) loss 7.2820 (7.4382) grad_norm 2.6757 (inf) loss_scale 512.0000 (961.4338) mem 22339MB +[2024-07-25 10:39:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][500/625] eta 0:01:13 lr 0.000349 wd 0.0500 time 0.5738 (0.5881) data time 0.0009 (0.0016) model time 0.5729 (0.5855) loss 8.2904 (7.4468) grad_norm 1.6385 (inf) loss_scale 512.0000 (952.4631) mem 22339MB +[2024-07-25 10:39:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][510/625] eta 0:01:07 lr 0.000349 wd 0.0500 time 0.7057 (0.5886) data time 0.0006 (0.0016) model time 0.7051 (0.5860) loss 6.3301 (7.4374) grad_norm 2.4543 (inf) loss_scale 512.0000 (943.8434) mem 22339MB +[2024-07-25 10:40:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][520/625] eta 0:01:01 lr 0.000349 wd 0.0500 time 0.5703 (0.5888) data time 0.0006 (0.0016) model time 0.5698 (0.5863) loss 7.1475 (7.4361) grad_norm 2.0090 (inf) loss_scale 512.0000 (935.5547) mem 22339MB +[2024-07-25 10:40:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][530/625] eta 0:00:55 lr 0.000349 wd 0.0500 time 0.5679 (0.5885) data time 0.0007 (0.0016) model time 0.5672 (0.5860) loss 7.4800 (7.4374) grad_norm 2.1171 (inf) loss_scale 512.0000 (927.5782) mem 22339MB +[2024-07-25 10:40:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][540/625] eta 0:00:50 lr 0.000349 wd 0.0500 time 0.5740 (0.5882) data time 0.0008 (0.0016) model time 0.5732 (0.5858) loss 7.9136 (7.4410) grad_norm 3.4198 (inf) loss_scale 512.0000 (919.8965) mem 22339MB +[2024-07-25 10:40:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][550/625] eta 0:00:44 lr 0.000349 wd 0.0500 time 0.5732 (0.5880) data time 0.0008 (0.0016) model time 0.5724 (0.5855) loss 8.8653 (7.4387) grad_norm 4.6945 (inf) loss_scale 512.0000 (912.4936) mem 22339MB +[2024-07-25 10:40:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][560/625] eta 0:00:38 lr 0.000349 wd 0.0500 time 0.5759 (0.5878) data time 0.0008 (0.0016) model time 0.5751 (0.5853) loss 8.1821 (7.4363) grad_norm 4.5096 (inf) loss_scale 512.0000 (905.3547) mem 22339MB +[2024-07-25 10:40:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][570/625] eta 0:00:32 lr 0.000349 wd 0.0500 time 0.5755 (0.5875) data time 0.0006 (0.0016) model time 0.5749 (0.5851) loss 5.9244 (7.4320) grad_norm 2.2881 (inf) loss_scale 512.0000 (898.4658) mem 22339MB +[2024-07-25 10:40:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][580/625] eta 0:00:26 lr 0.000349 wd 0.0500 time 0.5725 (0.5873) data time 0.0008 (0.0015) model time 0.5717 (0.5849) loss 6.7797 (7.4284) grad_norm 1.9013 (inf) loss_scale 512.0000 (891.8141) mem 22339MB +[2024-07-25 10:40:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][590/625] eta 0:00:20 lr 0.000349 wd 0.0500 time 0.5725 (0.5871) data time 0.0006 (0.0015) model time 0.5718 (0.5847) loss 7.6424 (7.4337) grad_norm 2.8071 (inf) loss_scale 512.0000 (885.3875) mem 22339MB +[2024-07-25 10:40:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][600/625] eta 0:00:14 lr 0.000349 wd 0.0500 time 0.5738 (0.5869) data time 0.0006 (0.0015) model time 0.5732 (0.5845) loss 6.2734 (7.4385) grad_norm 3.6587 (inf) loss_scale 512.0000 (879.1747) mem 22339MB +[2024-07-25 10:40:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][610/625] eta 0:00:08 lr 0.000348 wd 0.0500 time 0.5637 (0.5867) data time 0.0004 (0.0015) model time 0.5633 (0.5843) loss 6.7486 (7.4321) grad_norm 4.1160 (inf) loss_scale 512.0000 (873.1653) mem 22339MB +[2024-07-25 10:41:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [199/300][620/625] eta 0:00:02 lr 0.000348 wd 0.0500 time 0.5750 (0.5865) data time 0.0004 (0.0015) model time 0.5746 (0.5841) loss 6.0276 (7.4322) grad_norm 2.8848 (inf) loss_scale 512.0000 (867.3494) mem 22339MB +[2024-07-25 10:41:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 199 training takes 0:06:06 +[2024-07-25 10:41:03 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 10:41:04 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 10:41:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.466 (0.466) Loss 0.5000 (0.5000) Acc@1 89.697 (89.697) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 10:41:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7637 (0.6203) Acc@1 81.445 (86.945) Acc@5 96.777 (97.945) Mem 22339MB +[2024-07-25 10:41:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8506 (0.7159) Acc@1 79.785 (84.191) Acc@5 95.850 (97.005) Mem 22339MB +[2024-07-25 10:41:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.811 Acc@5 97.013 +[2024-07-25 10:41:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.8% +[2024-07-25 10:41:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.792 (0.792) Loss 0.5034 (0.5034) Acc@1 90.332 (90.332) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-25 10:41:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.186) Loss 0.7524 (0.6215) Acc@1 83.350 (87.407) Acc@5 96.533 (98.011) Mem 22339MB +[2024-07-25 10:41:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.157) Loss 0.8706 (0.7142) Acc@1 78.857 (84.480) Acc@5 96.094 (97.128) Mem 22339MB +[2024-07-25 10:41:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.089 Acc@5 97.119 +[2024-07-25 10:41:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.1% +[2024-07-25 10:41:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.09% +[2024-07-25 10:41:11 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 10:41:13 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 10:41:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][0/625] eta 0:09:03 lr 0.000348 wd 0.0500 time 0.8698 (0.8698) data time 0.3497 (0.3497) model time 0.0000 (0.0000) loss 7.7545 (7.7545) grad_norm 2.2259 (2.2259) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:41:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][10/625] eta 0:06:09 lr 0.000348 wd 0.0500 time 0.5749 (0.6007) data time 0.0006 (0.0331) model time 0.0000 (0.0000) loss 6.5399 (7.0866) grad_norm 2.1365 (2.4874) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:41:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][20/625] eta 0:05:55 lr 0.000348 wd 0.0500 time 0.5702 (0.5880) data time 0.0006 (0.0177) model time 0.0000 (0.0000) loss 6.8103 (6.9829) grad_norm 1.8860 (2.5280) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:41:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][30/625] eta 0:05:47 lr 0.000348 wd 0.0500 time 0.5732 (0.5841) data time 0.0006 (0.0123) model time 0.0000 (0.0000) loss 7.9137 (7.0173) grad_norm 2.2145 (2.7315) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:41:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][40/625] eta 0:05:41 lr 0.000348 wd 0.0500 time 0.5688 (0.5833) data time 0.0006 (0.0095) model time 0.0000 (0.0000) loss 6.2999 (7.0324) grad_norm 2.2099 (2.7745) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:41:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][50/625] eta 0:05:34 lr 0.000348 wd 0.0500 time 0.5720 (0.5812) data time 0.0008 (0.0078) model time 0.0000 (0.0000) loss 7.4831 (7.0206) grad_norm 1.9750 (2.6440) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:41:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][60/625] eta 0:05:29 lr 0.000348 wd 0.0500 time 0.6363 (0.5827) data time 0.0008 (0.0067) model time 0.6355 (0.5896) loss 6.3247 (7.0791) grad_norm 1.8776 (2.6278) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:41:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][70/625] eta 0:05:25 lr 0.000348 wd 0.0500 time 0.7355 (0.5858) data time 0.0006 (0.0059) model time 0.7349 (0.5965) loss 7.2509 (7.1112) grad_norm 1.7629 (2.5796) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:42:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][80/625] eta 0:05:21 lr 0.000348 wd 0.0500 time 0.7724 (0.5901) data time 0.0006 (0.0052) model time 0.7718 (0.6044) loss 8.2784 (7.1775) grad_norm 2.3489 (2.5533) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:42:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][90/625] eta 0:05:19 lr 0.000347 wd 0.0500 time 0.7145 (0.5968) data time 0.0008 (0.0048) model time 0.7137 (0.6157) loss 8.0990 (7.1878) grad_norm 1.5221 (2.4967) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:42:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][100/625] eta 0:05:13 lr 0.000347 wd 0.0500 time 0.5719 (0.5973) data time 0.0006 (0.0044) model time 0.5713 (0.6128) loss 6.5051 (7.1800) grad_norm 2.0055 (2.4710) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:42:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][110/625] eta 0:05:07 lr 0.000347 wd 0.0500 time 0.5718 (0.5973) data time 0.0008 (0.0041) model time 0.5710 (0.6101) loss 7.6661 (7.1676) grad_norm 2.5535 (2.4651) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:42:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][120/625] eta 0:05:01 lr 0.000347 wd 0.0500 time 0.5753 (0.5966) data time 0.0008 (0.0038) model time 0.5745 (0.6069) loss 6.4140 (7.1746) grad_norm 2.1327 (2.4552) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:42:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][130/625] eta 0:04:54 lr 0.000347 wd 0.0500 time 0.5739 (0.5949) data time 0.0008 (0.0036) model time 0.5731 (0.6027) loss 8.0535 (7.1627) grad_norm 4.8623 (2.4845) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:42:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][140/625] eta 0:04:47 lr 0.000347 wd 0.0500 time 0.5713 (0.5935) data time 0.0008 (0.0034) model time 0.5704 (0.5996) loss 7.6242 (7.1969) grad_norm 6.4484 (2.5419) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:42:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][150/625] eta 0:04:41 lr 0.000347 wd 0.0500 time 0.5727 (0.5924) data time 0.0006 (0.0032) model time 0.5721 (0.5972) loss 7.1722 (7.2191) grad_norm 2.0993 (2.5521) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:42:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][160/625] eta 0:04:34 lr 0.000347 wd 0.0500 time 0.5717 (0.5913) data time 0.0008 (0.0031) model time 0.5709 (0.5950) loss 8.1191 (7.2167) grad_norm 2.3016 (2.5668) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:42:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][170/625] eta 0:04:28 lr 0.000347 wd 0.0500 time 0.5729 (0.5903) data time 0.0007 (0.0029) model time 0.5722 (0.5933) loss 6.9044 (7.2344) grad_norm 2.5158 (2.5575) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:43:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][180/625] eta 0:04:22 lr 0.000347 wd 0.0500 time 0.5731 (0.5895) data time 0.0008 (0.0029) model time 0.5723 (0.5918) loss 7.3724 (7.2324) grad_norm 2.0868 (2.5984) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:43:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][190/625] eta 0:04:16 lr 0.000346 wd 0.0500 time 0.5728 (0.5887) data time 0.0008 (0.0028) model time 0.5720 (0.5905) loss 8.1682 (7.2641) grad_norm 2.0524 (2.6166) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:43:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][200/625] eta 0:04:09 lr 0.000346 wd 0.0500 time 0.5742 (0.5880) data time 0.0006 (0.0027) model time 0.5736 (0.5894) loss 7.1137 (7.2646) grad_norm 3.2708 (2.6119) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:43:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][210/625] eta 0:04:03 lr 0.000346 wd 0.0500 time 0.5728 (0.5874) data time 0.0008 (0.0026) model time 0.5720 (0.5884) loss 7.7185 (7.2826) grad_norm 4.0485 (2.6632) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:43:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][220/625] eta 0:03:57 lr 0.000346 wd 0.0500 time 0.5756 (0.5868) data time 0.0008 (0.0025) model time 0.5749 (0.5876) loss 6.5469 (7.3004) grad_norm 3.0726 (2.6569) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:43:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][230/625] eta 0:03:51 lr 0.000346 wd 0.0500 time 0.5732 (0.5863) data time 0.0008 (0.0024) model time 0.5724 (0.5868) loss 7.0377 (7.2984) grad_norm 2.3182 (2.6733) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:43:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][240/625] eta 0:03:45 lr 0.000346 wd 0.0500 time 0.5639 (0.5858) data time 0.0009 (0.0024) model time 0.5630 (0.5861) loss 7.3675 (7.2863) grad_norm 2.1533 (2.6652) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:43:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][250/625] eta 0:03:39 lr 0.000346 wd 0.0500 time 0.5724 (0.5854) data time 0.0008 (0.0023) model time 0.5716 (0.5856) loss 6.6068 (7.2976) grad_norm 2.2205 (2.7241) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:43:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][260/625] eta 0:03:33 lr 0.000346 wd 0.0500 time 0.5712 (0.5856) data time 0.0008 (0.0022) model time 0.5704 (0.5857) loss 7.9716 (7.2958) grad_norm 1.8820 (2.7173) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:43:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][270/625] eta 0:03:27 lr 0.000346 wd 0.0500 time 0.5754 (0.5852) data time 0.0006 (0.0022) model time 0.5748 (0.5852) loss 8.2629 (7.2973) grad_norm 3.0218 (2.7933) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:43:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][280/625] eta 0:03:22 lr 0.000346 wd 0.0500 time 0.5713 (0.5858) data time 0.0008 (0.0021) model time 0.5706 (0.5860) loss 7.8746 (7.3107) grad_norm 2.0377 (2.7927) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:44:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][290/625] eta 0:03:16 lr 0.000345 wd 0.0500 time 0.5738 (0.5859) data time 0.0006 (0.0021) model time 0.5732 (0.5860) loss 6.5200 (7.2911) grad_norm 1.8431 (2.7722) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:44:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][300/625] eta 0:03:11 lr 0.000345 wd 0.0500 time 0.7822 (0.5878) data time 0.0006 (0.0021) model time 0.7816 (0.5883) loss 7.9678 (7.2840) grad_norm 2.2325 (2.7623) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:44:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][310/625] eta 0:03:05 lr 0.000345 wd 0.0500 time 0.6906 (0.5902) data time 0.0008 (0.0020) model time 0.6898 (0.5910) loss 9.2378 (7.2928) grad_norm 2.4196 (2.7590) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:44:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][320/625] eta 0:02:59 lr 0.000345 wd 0.0500 time 0.5735 (0.5901) data time 0.0009 (0.0020) model time 0.5727 (0.5909) loss 7.3990 (7.2866) grad_norm 3.3154 (2.7547) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:44:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][330/625] eta 0:02:54 lr 0.000345 wd 0.0500 time 0.5716 (0.5905) data time 0.0006 (0.0019) model time 0.5710 (0.5913) loss 7.1279 (7.2746) grad_norm 3.8504 (2.7600) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:44:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][340/625] eta 0:02:48 lr 0.000345 wd 0.0500 time 0.5749 (0.5904) data time 0.0008 (0.0019) model time 0.5741 (0.5911) loss 6.9465 (7.2677) grad_norm 2.0166 (2.7426) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:44:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][350/625] eta 0:02:42 lr 0.000345 wd 0.0500 time 0.5727 (0.5899) data time 0.0008 (0.0019) model time 0.5719 (0.5905) loss 7.7461 (7.2616) grad_norm 1.9260 (2.7322) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:44:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][360/625] eta 0:02:36 lr 0.000345 wd 0.0500 time 0.5738 (0.5894) data time 0.0008 (0.0019) model time 0.5731 (0.5899) loss 8.5090 (7.2711) grad_norm 2.0139 (2.7130) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:44:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][370/625] eta 0:02:30 lr 0.000345 wd 0.0500 time 0.5696 (0.5890) data time 0.0008 (0.0018) model time 0.5689 (0.5894) loss 6.6585 (7.2632) grad_norm 2.6676 (2.7060) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:44:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][380/625] eta 0:02:24 lr 0.000345 wd 0.0500 time 0.5740 (0.5886) data time 0.0006 (0.0018) model time 0.5734 (0.5889) loss 7.4299 (7.2676) grad_norm 2.8479 (2.7052) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:45:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][390/625] eta 0:02:18 lr 0.000345 wd 0.0500 time 0.5739 (0.5883) data time 0.0006 (0.0018) model time 0.5733 (0.5885) loss 7.3615 (7.2630) grad_norm 2.1401 (2.7510) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:45:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][400/625] eta 0:02:12 lr 0.000344 wd 0.0500 time 0.5607 (0.5880) data time 0.0006 (0.0017) model time 0.5601 (0.5881) loss 6.8027 (7.2614) grad_norm 1.9101 (2.7394) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:45:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][410/625] eta 0:02:06 lr 0.000344 wd 0.0500 time 0.5699 (0.5878) data time 0.0008 (0.0017) model time 0.5691 (0.5879) loss 8.1213 (7.2633) grad_norm 2.7581 (2.7321) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:45:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][420/625] eta 0:02:00 lr 0.000344 wd 0.0500 time 0.5702 (0.5875) data time 0.0006 (0.0017) model time 0.5696 (0.5875) loss 7.3165 (7.2649) grad_norm 2.8198 (2.7213) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:45:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][430/625] eta 0:01:54 lr 0.000344 wd 0.0500 time 0.5714 (0.5872) data time 0.0007 (0.0017) model time 0.5708 (0.5871) loss 7.8747 (7.2681) grad_norm 2.2321 (2.7146) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:45:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][440/625] eta 0:01:48 lr 0.000344 wd 0.0500 time 0.5743 (0.5869) data time 0.0007 (0.0017) model time 0.5736 (0.5868) loss 7.9802 (7.2638) grad_norm 2.1990 (2.7059) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:45:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][450/625] eta 0:01:42 lr 0.000344 wd 0.0500 time 0.5743 (0.5866) data time 0.0006 (0.0017) model time 0.5737 (0.5865) loss 7.8542 (7.2628) grad_norm 1.9864 (2.6966) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:45:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][460/625] eta 0:01:36 lr 0.000344 wd 0.0500 time 0.5659 (0.5864) data time 0.0006 (0.0016) model time 0.5653 (0.5862) loss 8.9826 (7.2586) grad_norm 1.8317 (2.6864) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:45:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][470/625] eta 0:01:30 lr 0.000344 wd 0.0500 time 0.5722 (0.5862) data time 0.0006 (0.0016) model time 0.5716 (0.5859) loss 7.9361 (7.2586) grad_norm 1.9693 (2.6934) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:45:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][480/625] eta 0:01:25 lr 0.000344 wd 0.0500 time 0.5719 (0.5862) data time 0.0008 (0.0016) model time 0.5711 (0.5860) loss 8.2772 (7.2615) grad_norm 2.3280 (2.6840) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:46:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][490/625] eta 0:01:19 lr 0.000344 wd 0.0500 time 0.5744 (0.5860) data time 0.0006 (0.0016) model time 0.5737 (0.5857) loss 5.8451 (7.2564) grad_norm 2.5355 (2.6762) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:46:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][500/625] eta 0:01:13 lr 0.000343 wd 0.0500 time 0.5745 (0.5862) data time 0.0008 (0.0016) model time 0.5737 (0.5859) loss 7.4891 (7.2471) grad_norm 2.4069 (2.6765) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:46:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][510/625] eta 0:01:07 lr 0.000343 wd 0.0500 time 0.7493 (0.5865) data time 0.0008 (0.0016) model time 0.7485 (0.5863) loss 5.9414 (7.2485) grad_norm 1.7346 (2.6675) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:46:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][520/625] eta 0:01:01 lr 0.000343 wd 0.0500 time 0.7686 (0.5877) data time 0.0007 (0.0015) model time 0.7679 (0.5876) loss 6.5449 (7.2516) grad_norm 1.5677 (2.6532) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:46:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][530/625] eta 0:00:55 lr 0.000343 wd 0.0500 time 0.6964 (0.5890) data time 0.0006 (0.0015) model time 0.6958 (0.5890) loss 6.3339 (7.2522) grad_norm 1.6668 (2.6465) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:46:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][540/625] eta 0:00:50 lr 0.000343 wd 0.0500 time 0.5748 (0.5890) data time 0.0007 (0.0015) model time 0.5740 (0.5889) loss 6.4967 (7.2566) grad_norm 1.8675 (2.6649) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:46:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][550/625] eta 0:00:44 lr 0.000343 wd 0.0500 time 0.7246 (0.5895) data time 0.0006 (0.0015) model time 0.7240 (0.5895) loss 6.8004 (7.2655) grad_norm 1.6781 (2.6577) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:46:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][560/625] eta 0:00:38 lr 0.000343 wd 0.0500 time 0.5702 (0.5893) data time 0.0006 (0.0015) model time 0.5696 (0.5892) loss 6.4423 (7.2577) grad_norm 1.6574 (2.6489) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:46:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][570/625] eta 0:00:32 lr 0.000343 wd 0.0500 time 0.5737 (0.5890) data time 0.0008 (0.0015) model time 0.5729 (0.5889) loss 7.3864 (7.2575) grad_norm 2.2753 (2.6453) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:46:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][580/625] eta 0:00:26 lr 0.000343 wd 0.0500 time 0.5707 (0.5887) data time 0.0008 (0.0015) model time 0.5699 (0.5886) loss 7.3549 (7.2624) grad_norm 1.9116 (2.6370) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:47:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][590/625] eta 0:00:20 lr 0.000343 wd 0.0500 time 0.5731 (0.5885) data time 0.0008 (0.0015) model time 0.5723 (0.5883) loss 7.3555 (7.2657) grad_norm 1.9696 (2.6321) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:47:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][600/625] eta 0:00:14 lr 0.000343 wd 0.0500 time 0.5737 (0.5883) data time 0.0008 (0.0014) model time 0.5730 (0.5880) loss 8.0374 (7.2672) grad_norm 1.9722 (2.6242) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:47:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][610/625] eta 0:00:08 lr 0.000342 wd 0.0500 time 0.5694 (0.5880) data time 0.0006 (0.0014) model time 0.5688 (0.5878) loss 5.9532 (7.2734) grad_norm 3.4568 (2.6212) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:47:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [200/300][620/625] eta 0:00:02 lr 0.000342 wd 0.0500 time 0.5715 (0.5878) data time 0.0004 (0.0014) model time 0.5711 (0.5875) loss 6.2849 (7.2763) grad_norm 2.7847 (2.6168) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:47:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 200 training takes 0:06:07 +[2024-07-25 10:47:20 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 10:47:22 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 10:47:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.479 (0.479) Loss 0.5176 (0.5176) Acc@1 90.234 (90.234) Acc@5 98.828 (98.828) Mem 22339MB +[2024-07-25 10:47:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.7666 (0.6323) Acc@1 82.422 (87.194) Acc@5 96.631 (97.909) Mem 22339MB +[2024-07-25 10:47:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8784 (0.7278) Acc@1 78.809 (84.324) Acc@5 95.996 (97.019) Mem 22339MB +[2024-07-25 10:47:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.931 Acc@5 97.025 +[2024-07-25 10:47:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.9% +[2024-07-25 10:47:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.815 (0.815) Loss 0.5029 (0.5029) Acc@1 90.283 (90.283) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-25 10:47:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.188) Loss 0.7520 (0.6216) Acc@1 83.447 (87.416) Acc@5 96.533 (97.998) Mem 22339MB +[2024-07-25 10:47:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.158) Loss 0.8691 (0.7141) Acc@1 78.906 (84.475) Acc@5 96.045 (97.126) Mem 22339MB +[2024-07-25 10:47:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.087 Acc@5 97.119 +[2024-07-25 10:47:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.1% +[2024-07-25 10:47:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][0/625] eta 0:14:33 lr 0.000342 wd 0.0500 time 1.3970 (1.3970) data time 0.5427 (0.5427) model time 0.0000 (0.0000) loss 8.1162 (8.1162) grad_norm 1.9206 (1.9206) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:47:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][10/625] eta 0:06:38 lr 0.000342 wd 0.0500 time 0.5745 (0.6484) data time 0.0006 (0.0501) model time 0.0000 (0.0000) loss 7.4325 (7.4621) grad_norm 2.1291 (2.7399) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:47:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][20/625] eta 0:06:10 lr 0.000342 wd 0.0500 time 0.5725 (0.6127) data time 0.0007 (0.0266) model time 0.0000 (0.0000) loss 6.2925 (7.2895) grad_norm 2.4492 (2.5125) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:47:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][30/625] eta 0:05:57 lr 0.000342 wd 0.0500 time 0.5775 (0.6006) data time 0.0009 (0.0183) model time 0.0000 (0.0000) loss 7.6370 (7.2220) grad_norm 1.9677 (2.4336) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:47:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][40/625] eta 0:05:50 lr 0.000342 wd 0.0500 time 0.5208 (0.5985) data time 0.0007 (0.0141) model time 0.0000 (0.0000) loss 8.4020 (7.2630) grad_norm 1.9509 (2.3991) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:47:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][50/625] eta 0:05:41 lr 0.000342 wd 0.0500 time 0.5776 (0.5939) data time 0.0008 (0.0115) model time 0.0000 (0.0000) loss 7.8583 (7.3687) grad_norm 3.2167 (2.3852) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:48:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][60/625] eta 0:05:33 lr 0.000342 wd 0.0500 time 0.5750 (0.5904) data time 0.0008 (0.0097) model time 0.5742 (0.5718) loss 7.7581 (7.3682) grad_norm 2.8023 (2.3977) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:48:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][70/625] eta 0:05:26 lr 0.000342 wd 0.0500 time 0.5814 (0.5883) data time 0.0006 (0.0086) model time 0.5808 (0.5729) loss 8.4392 (7.3988) grad_norm 1.9338 (2.3489) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:48:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][80/625] eta 0:05:19 lr 0.000342 wd 0.0500 time 0.5732 (0.5866) data time 0.0006 (0.0076) model time 0.5726 (0.5732) loss 7.2394 (7.4101) grad_norm 1.8906 (2.3653) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:48:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][90/625] eta 0:05:13 lr 0.000341 wd 0.0500 time 0.5794 (0.5857) data time 0.0006 (0.0069) model time 0.5787 (0.5742) loss 8.5694 (7.4266) grad_norm 2.0573 (2.3364) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:48:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][100/625] eta 0:05:08 lr 0.000341 wd 0.0500 time 0.7408 (0.5875) data time 0.0008 (0.0063) model time 0.7400 (0.5800) loss 6.9246 (7.3716) grad_norm 2.7636 (2.3449) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:48:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][110/625] eta 0:05:03 lr 0.000341 wd 0.0500 time 0.6904 (0.5896) data time 0.0007 (0.0058) model time 0.6897 (0.5850) loss 7.5190 (7.3804) grad_norm 3.4894 (2.4318) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:48:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][120/625] eta 0:04:59 lr 0.000341 wd 0.0500 time 0.7280 (0.5937) data time 0.0006 (0.0054) model time 0.7274 (0.5926) loss 7.1575 (7.3392) grad_norm 2.6756 (2.5229) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:48:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][130/625] eta 0:04:54 lr 0.000341 wd 0.0500 time 0.5724 (0.5959) data time 0.0006 (0.0050) model time 0.5718 (0.5963) loss 8.3503 (7.3359) grad_norm 2.3739 (2.5499) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:48:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][140/625] eta 0:04:48 lr 0.000341 wd 0.0500 time 0.6819 (0.5951) data time 0.0008 (0.0048) model time 0.6810 (0.5949) loss 7.4033 (7.3341) grad_norm 2.0604 (2.5604) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:48:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][150/625] eta 0:04:43 lr 0.000341 wd 0.0500 time 0.7052 (0.5966) data time 0.0008 (0.0045) model time 0.7044 (0.5971) loss 6.5314 (7.3209) grad_norm 2.2296 (2.5303) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:49:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][160/625] eta 0:04:36 lr 0.000341 wd 0.0500 time 0.5777 (0.5953) data time 0.0008 (0.0043) model time 0.5768 (0.5950) loss 8.7232 (7.3270) grad_norm 2.4577 (2.5403) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:49:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][170/625] eta 0:04:30 lr 0.000341 wd 0.0500 time 0.5755 (0.5941) data time 0.0008 (0.0041) model time 0.5747 (0.5932) loss 9.3752 (7.3420) grad_norm 1.8712 (2.5266) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:49:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][180/625] eta 0:04:23 lr 0.000341 wd 0.0500 time 0.5726 (0.5930) data time 0.0007 (0.0039) model time 0.5719 (0.5918) loss 6.3978 (7.3374) grad_norm 3.1441 (2.5103) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:49:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][190/625] eta 0:04:17 lr 0.000340 wd 0.0500 time 0.5742 (0.5921) data time 0.0008 (0.0037) model time 0.5733 (0.5905) loss 9.0216 (7.3404) grad_norm 2.2220 (2.4846) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:49:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][200/625] eta 0:04:11 lr 0.000340 wd 0.0500 time 0.5805 (0.5914) data time 0.0008 (0.0036) model time 0.5796 (0.5896) loss 7.6348 (7.3156) grad_norm 1.9756 (2.4885) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:49:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][210/625] eta 0:04:05 lr 0.000340 wd 0.0500 time 0.5747 (0.5906) data time 0.0007 (0.0035) model time 0.5740 (0.5886) loss 6.9006 (7.3121) grad_norm 2.0700 (2.4763) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:49:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][220/625] eta 0:03:58 lr 0.000340 wd 0.0500 time 0.5719 (0.5898) data time 0.0008 (0.0034) model time 0.5711 (0.5877) loss 6.1912 (7.3050) grad_norm 2.0421 (2.4565) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:49:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][230/625] eta 0:03:52 lr 0.000340 wd 0.0500 time 0.5760 (0.5892) data time 0.0008 (0.0032) model time 0.5752 (0.5870) loss 6.8191 (7.2987) grad_norm 4.1646 (2.4575) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:49:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][240/625] eta 0:03:46 lr 0.000340 wd 0.0500 time 0.5735 (0.5886) data time 0.0007 (0.0032) model time 0.5728 (0.5862) loss 7.1533 (7.3066) grad_norm 2.2686 (2.4705) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:49:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][250/625] eta 0:03:40 lr 0.000340 wd 0.0500 time 0.5753 (0.5881) data time 0.0007 (0.0031) model time 0.5746 (0.5857) loss 7.7909 (7.2897) grad_norm 2.3650 (2.4679) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:50:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][260/625] eta 0:03:34 lr 0.000340 wd 0.0500 time 0.5734 (0.5879) data time 0.0006 (0.0030) model time 0.5727 (0.5855) loss 7.6486 (7.2956) grad_norm 2.1839 (2.4617) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:50:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][270/625] eta 0:03:28 lr 0.000340 wd 0.0500 time 0.5776 (0.5874) data time 0.0008 (0.0029) model time 0.5768 (0.5850) loss 8.0596 (7.2952) grad_norm 2.5026 (2.4645) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:50:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][280/625] eta 0:03:22 lr 0.000340 wd 0.0500 time 0.5753 (0.5869) data time 0.0008 (0.0028) model time 0.5745 (0.5845) loss 6.7569 (7.2846) grad_norm 1.9356 (2.4564) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:50:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][290/625] eta 0:03:16 lr 0.000340 wd 0.0500 time 0.5740 (0.5865) data time 0.0006 (0.0028) model time 0.5734 (0.5840) loss 7.3141 (7.2928) grad_norm 2.0152 (2.4502) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:50:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][300/625] eta 0:03:10 lr 0.000339 wd 0.0500 time 0.5774 (0.5861) data time 0.0008 (0.0027) model time 0.5766 (0.5836) loss 6.3748 (7.2768) grad_norm 2.1967 (2.4418) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:50:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][310/625] eta 0:03:04 lr 0.000339 wd 0.0500 time 0.5732 (0.5858) data time 0.0006 (0.0027) model time 0.5726 (0.5832) loss 7.6370 (7.2831) grad_norm 2.5204 (2.4404) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:50:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][320/625] eta 0:02:58 lr 0.000339 wd 0.0500 time 0.6976 (0.5863) data time 0.0008 (0.0026) model time 0.6968 (0.5840) loss 6.1334 (7.2839) grad_norm 2.5070 (2.4403) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:50:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][330/625] eta 0:02:53 lr 0.000339 wd 0.0500 time 0.7512 (0.5875) data time 0.0008 (0.0025) model time 0.7504 (0.5854) loss 7.8204 (7.2913) grad_norm 1.6148 (2.4333) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:50:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][340/625] eta 0:02:47 lr 0.000339 wd 0.0500 time 0.6756 (0.5886) data time 0.0006 (0.0025) model time 0.6750 (0.5867) loss 6.8923 (7.2906) grad_norm 1.7555 (2.4360) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:50:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][350/625] eta 0:02:42 lr 0.000339 wd 0.0500 time 0.5714 (0.5906) data time 0.0008 (0.0024) model time 0.5705 (0.5891) loss 7.7731 (7.2826) grad_norm 2.6185 (2.4363) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:51:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][360/625] eta 0:02:36 lr 0.000339 wd 0.0500 time 0.6900 (0.5904) data time 0.0009 (0.0024) model time 0.6890 (0.5890) loss 9.1550 (7.2857) grad_norm 3.5555 (2.4494) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:51:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][370/625] eta 0:02:30 lr 0.000339 wd 0.0500 time 0.5740 (0.5908) data time 0.0008 (0.0024) model time 0.5732 (0.5894) loss 7.7117 (7.2922) grad_norm 2.9624 (2.4609) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:51:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][380/625] eta 0:02:24 lr 0.000339 wd 0.0500 time 0.5747 (0.5904) data time 0.0006 (0.0023) model time 0.5742 (0.5889) loss 8.2034 (7.2924) grad_norm 3.0315 (2.4612) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:51:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][390/625] eta 0:02:18 lr 0.000339 wd 0.0500 time 0.5744 (0.5900) data time 0.0006 (0.0023) model time 0.5738 (0.5885) loss 7.6802 (7.3014) grad_norm 3.1949 (2.4649) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:51:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][400/625] eta 0:02:12 lr 0.000338 wd 0.0500 time 0.5751 (0.5896) data time 0.0009 (0.0022) model time 0.5742 (0.5881) loss 6.6704 (7.2959) grad_norm 3.3342 (2.4847) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:51:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][410/625] eta 0:02:06 lr 0.000338 wd 0.0500 time 0.5751 (0.5893) data time 0.0007 (0.0022) model time 0.5744 (0.5878) loss 6.0262 (7.2886) grad_norm 3.9533 (2.5145) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:51:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][420/625] eta 0:02:00 lr 0.000338 wd 0.0500 time 0.5786 (0.5890) data time 0.0008 (0.0022) model time 0.5778 (0.5875) loss 8.9536 (7.2986) grad_norm 3.0450 (2.5242) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:51:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][430/625] eta 0:01:54 lr 0.000338 wd 0.0500 time 0.5738 (0.5887) data time 0.0007 (0.0022) model time 0.5731 (0.5871) loss 7.3908 (7.3012) grad_norm 1.9597 (2.5242) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:51:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][440/625] eta 0:01:48 lr 0.000338 wd 0.0500 time 0.5749 (0.5884) data time 0.0008 (0.0021) model time 0.5742 (0.5868) loss 6.8516 (7.2961) grad_norm 2.5738 (2.5340) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:51:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][450/625] eta 0:01:42 lr 0.000338 wd 0.0500 time 0.5747 (0.5881) data time 0.0008 (0.0021) model time 0.5739 (0.5864) loss 6.0883 (7.3002) grad_norm 2.0959 (2.5368) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:52:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][460/625] eta 0:01:36 lr 0.000338 wd 0.0500 time 0.5765 (0.5878) data time 0.0008 (0.0021) model time 0.5757 (0.5861) loss 8.1737 (7.3083) grad_norm 2.0572 (2.5276) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:52:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][470/625] eta 0:01:31 lr 0.000338 wd 0.0500 time 0.5769 (0.5875) data time 0.0007 (0.0020) model time 0.5762 (0.5858) loss 8.8011 (7.3184) grad_norm 2.2355 (2.5255) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:52:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][480/625] eta 0:01:25 lr 0.000338 wd 0.0500 time 0.5742 (0.5875) data time 0.0008 (0.0020) model time 0.5735 (0.5859) loss 6.1816 (7.3181) grad_norm 3.2124 (2.5268) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:52:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][490/625] eta 0:01:19 lr 0.000338 wd 0.0500 time 0.5766 (0.5873) data time 0.0008 (0.0020) model time 0.5758 (0.5856) loss 8.1527 (7.3223) grad_norm 2.9358 (2.5254) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:52:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][500/625] eta 0:01:13 lr 0.000338 wd 0.0500 time 0.5760 (0.5870) data time 0.0006 (0.0020) model time 0.5754 (0.5853) loss 6.0420 (7.3083) grad_norm 2.2117 (2.5152) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:52:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][510/625] eta 0:01:07 lr 0.000337 wd 0.0500 time 0.5735 (0.5867) data time 0.0008 (0.0020) model time 0.5727 (0.5850) loss 6.1872 (7.3033) grad_norm 1.8112 (2.5541) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:52:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][520/625] eta 0:01:01 lr 0.000337 wd 0.0500 time 0.5760 (0.5865) data time 0.0008 (0.0019) model time 0.5752 (0.5848) loss 7.8383 (7.3118) grad_norm 3.0549 (2.5553) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:52:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][530/625] eta 0:00:55 lr 0.000337 wd 0.0500 time 0.5777 (0.5863) data time 0.0010 (0.0019) model time 0.5767 (0.5846) loss 7.5364 (7.3101) grad_norm 2.8030 (2.5528) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:52:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][540/625] eta 0:00:49 lr 0.000337 wd 0.0500 time 0.5735 (0.5868) data time 0.0006 (0.0019) model time 0.5728 (0.5852) loss 7.8105 (7.3151) grad_norm 2.0864 (2.5480) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:52:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][550/625] eta 0:00:44 lr 0.000337 wd 0.0500 time 0.7361 (0.5874) data time 0.0008 (0.0019) model time 0.7353 (0.5859) loss 7.4053 (7.3103) grad_norm 1.9811 (2.5428) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:52:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][560/625] eta 0:00:38 lr 0.000337 wd 0.0500 time 0.7614 (0.5886) data time 0.0008 (0.0019) model time 0.7607 (0.5872) loss 7.1763 (7.3129) grad_norm 1.9871 (2.5355) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:53:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][570/625] eta 0:00:32 lr 0.000337 wd 0.0500 time 0.5782 (0.5892) data time 0.0006 (0.0018) model time 0.5776 (0.5879) loss 7.3985 (7.3128) grad_norm 1.7728 (2.5274) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:53:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][580/625] eta 0:00:26 lr 0.000337 wd 0.0500 time 0.7273 (0.5892) data time 0.0006 (0.0018) model time 0.7268 (0.5879) loss 7.9395 (7.3119) grad_norm 2.6931 (2.5287) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:53:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][590/625] eta 0:00:20 lr 0.000337 wd 0.0500 time 0.5747 (0.5894) data time 0.0006 (0.0018) model time 0.5741 (0.5881) loss 6.3034 (7.3068) grad_norm 2.3414 (2.5246) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:53:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][600/625] eta 0:00:14 lr 0.000337 wd 0.0500 time 0.5753 (0.5892) data time 0.0008 (0.0018) model time 0.5745 (0.5878) loss 8.1548 (7.3054) grad_norm 1.6434 (2.5245) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:53:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][610/625] eta 0:00:08 lr 0.000336 wd 0.0500 time 0.5737 (0.5889) data time 0.0004 (0.0018) model time 0.5734 (0.5876) loss 5.5837 (7.2963) grad_norm 3.5506 (2.5263) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:53:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [201/300][620/625] eta 0:00:02 lr 0.000336 wd 0.0500 time 0.5745 (0.5887) data time 0.0004 (0.0018) model time 0.5741 (0.5873) loss 6.3351 (7.2989) grad_norm 2.7983 (2.5259) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:53:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 201 training takes 0:06:07 +[2024-07-25 10:53:37 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 10:53:38 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 10:53:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.477 (0.477) Loss 0.5117 (0.5117) Acc@1 90.137 (90.137) Acc@5 98.828 (98.828) Mem 22339MB +[2024-07-25 10:53:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7930 (0.6288) Acc@1 81.445 (87.154) Acc@5 96.533 (97.896) Mem 22339MB +[2024-07-25 10:53:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8916 (0.7263) Acc@1 78.174 (84.268) Acc@5 95.850 (97.024) Mem 22339MB +[2024-07-25 10:53:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.905 Acc@5 97.003 +[2024-07-25 10:53:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 83.9% +[2024-07-25 10:53:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.835 (0.835) Loss 0.5029 (0.5029) Acc@1 90.283 (90.283) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-25 10:53:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.190) Loss 0.7520 (0.6216) Acc@1 83.398 (87.447) Acc@5 96.533 (97.994) Mem 22339MB +[2024-07-25 10:53:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.159) Loss 0.8687 (0.7139) Acc@1 78.955 (84.496) Acc@5 95.996 (97.117) Mem 22339MB +[2024-07-25 10:53:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.113 Acc@5 97.103 +[2024-07-25 10:53:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.1% +[2024-07-25 10:53:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.11% +[2024-07-25 10:53:46 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 10:53:47 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 10:53:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][0/625] eta 0:09:07 lr 0.000336 wd 0.0500 time 0.8763 (0.8763) data time 0.3590 (0.3590) model time 0.0000 (0.0000) loss 7.4263 (7.4263) grad_norm 4.0587 (4.0587) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:53:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][10/625] eta 0:06:12 lr 0.000336 wd 0.0500 time 0.5666 (0.6060) data time 0.0008 (0.0334) model time 0.0000 (0.0000) loss 8.7029 (6.9055) grad_norm 4.7341 (3.0963) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:53:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][20/625] eta 0:05:57 lr 0.000336 wd 0.0500 time 0.5711 (0.5902) data time 0.0008 (0.0179) model time 0.0000 (0.0000) loss 5.4404 (6.9567) grad_norm 2.0401 (2.8238) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:54:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][30/625] eta 0:05:47 lr 0.000336 wd 0.0500 time 0.5724 (0.5847) data time 0.0008 (0.0123) model time 0.0000 (0.0000) loss 8.6460 (7.1043) grad_norm 2.8145 (2.8801) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:54:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][40/625] eta 0:05:40 lr 0.000336 wd 0.0500 time 0.5712 (0.5820) data time 0.0007 (0.0095) model time 0.0000 (0.0000) loss 7.8134 (7.1229) grad_norm 1.8180 (2.8577) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:54:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][50/625] eta 0:05:33 lr 0.000336 wd 0.0500 time 0.5726 (0.5803) data time 0.0008 (0.0079) model time 0.0000 (0.0000) loss 7.0734 (7.2157) grad_norm 2.6954 (2.7902) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:54:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][60/625] eta 0:05:27 lr 0.000336 wd 0.0500 time 0.5714 (0.5793) data time 0.0008 (0.0067) model time 0.5706 (0.5729) loss 7.6837 (7.2312) grad_norm 2.0890 (2.7233) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:54:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][70/625] eta 0:05:21 lr 0.000336 wd 0.0500 time 0.5733 (0.5785) data time 0.0008 (0.0059) model time 0.5725 (0.5731) loss 6.6345 (7.2483) grad_norm 1.6214 (2.7296) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:54:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][80/625] eta 0:05:15 lr 0.000336 wd 0.0500 time 0.5706 (0.5780) data time 0.0008 (0.0053) model time 0.5698 (0.5732) loss 7.8294 (7.3180) grad_norm 2.2047 (2.6435) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:54:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][90/625] eta 0:05:09 lr 0.000335 wd 0.0500 time 0.5714 (0.5777) data time 0.0008 (0.0048) model time 0.5706 (0.5734) loss 8.3181 (7.3170) grad_norm 1.6195 (2.5755) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:54:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][100/625] eta 0:05:03 lr 0.000335 wd 0.0500 time 0.5715 (0.5774) data time 0.0006 (0.0044) model time 0.5709 (0.5735) loss 7.0080 (7.2615) grad_norm 1.7882 (2.5320) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:54:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][110/625] eta 0:04:57 lr 0.000335 wd 0.0500 time 0.5724 (0.5771) data time 0.0008 (0.0041) model time 0.5716 (0.5734) loss 9.3644 (7.2731) grad_norm 2.3414 (2.5529) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:54:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][120/625] eta 0:04:51 lr 0.000335 wd 0.0500 time 0.5731 (0.5770) data time 0.0008 (0.0038) model time 0.5723 (0.5736) loss 7.2441 (7.2659) grad_norm 2.3263 (2.5697) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:55:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][130/625] eta 0:04:46 lr 0.000335 wd 0.0500 time 0.5727 (0.5785) data time 0.0009 (0.0036) model time 0.5717 (0.5764) loss 6.7676 (7.2688) grad_norm 2.2142 (2.5578) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:55:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][140/625] eta 0:04:40 lr 0.000335 wd 0.0500 time 0.5747 (0.5786) data time 0.0008 (0.0034) model time 0.5739 (0.5767) loss 7.3484 (7.2589) grad_norm 2.9922 (2.5494) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:55:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][150/625] eta 0:04:36 lr 0.000335 wd 0.0500 time 0.6855 (0.5821) data time 0.0007 (0.0032) model time 0.6847 (0.5820) loss 7.7500 (7.2650) grad_norm 2.2095 (2.5475) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:55:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][160/625] eta 0:04:32 lr 0.000335 wd 0.0500 time 0.7425 (0.5853) data time 0.0006 (0.0031) model time 0.7419 (0.5868) loss 6.8896 (7.2823) grad_norm 3.1838 (2.5403) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:55:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][170/625] eta 0:04:26 lr 0.000335 wd 0.0500 time 0.5731 (0.5856) data time 0.0006 (0.0030) model time 0.5725 (0.5869) loss 6.7688 (7.2771) grad_norm 1.9037 (2.5324) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:55:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][180/625] eta 0:04:21 lr 0.000335 wd 0.0500 time 0.5700 (0.5874) data time 0.0006 (0.0028) model time 0.5695 (0.5893) loss 7.1445 (7.2588) grad_norm 2.4138 (2.5134) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:55:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][190/625] eta 0:04:15 lr 0.000335 wd 0.0500 time 0.5717 (0.5880) data time 0.0008 (0.0027) model time 0.5709 (0.5900) loss 6.6657 (7.2669) grad_norm 2.2691 (2.5147) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:55:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][200/625] eta 0:04:09 lr 0.000334 wd 0.0500 time 0.5748 (0.5874) data time 0.0008 (0.0026) model time 0.5740 (0.5889) loss 7.0368 (7.2926) grad_norm 3.0987 (2.4995) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:55:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][210/625] eta 0:04:03 lr 0.000334 wd 0.0500 time 0.5691 (0.5867) data time 0.0006 (0.0026) model time 0.5685 (0.5879) loss 7.3075 (7.3001) grad_norm 2.0019 (2.5558) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:55:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][220/625] eta 0:03:57 lr 0.000334 wd 0.0500 time 0.5750 (0.5862) data time 0.0008 (0.0025) model time 0.5742 (0.5871) loss 8.6674 (7.2890) grad_norm 2.0040 (2.5356) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:56:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][230/625] eta 0:03:51 lr 0.000334 wd 0.0500 time 0.5710 (0.5862) data time 0.0008 (0.0024) model time 0.5702 (0.5870) loss 7.5904 (7.2712) grad_norm 4.4199 (2.5324) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:56:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][240/625] eta 0:03:45 lr 0.000334 wd 0.0500 time 0.5701 (0.5857) data time 0.0009 (0.0024) model time 0.5692 (0.5863) loss 7.4393 (7.2861) grad_norm 1.9358 (2.5246) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:56:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][250/625] eta 0:03:39 lr 0.000334 wd 0.0500 time 0.5727 (0.5853) data time 0.0006 (0.0023) model time 0.5722 (0.5856) loss 6.3340 (7.2817) grad_norm 2.1385 (2.5174) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:56:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][260/625] eta 0:03:33 lr 0.000334 wd 0.0500 time 0.5734 (0.5849) data time 0.0006 (0.0022) model time 0.5728 (0.5851) loss 6.5168 (7.2781) grad_norm 2.0159 (2.5036) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:56:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][270/625] eta 0:03:27 lr 0.000334 wd 0.0500 time 0.5751 (0.5845) data time 0.0008 (0.0022) model time 0.5743 (0.5846) loss 7.5289 (7.2895) grad_norm 2.3937 (2.4996) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:56:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][280/625] eta 0:03:21 lr 0.000334 wd 0.0500 time 0.5706 (0.5841) data time 0.0008 (0.0021) model time 0.5698 (0.5841) loss 6.6819 (7.2825) grad_norm 2.6050 (2.4923) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:56:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][290/625] eta 0:03:15 lr 0.000334 wd 0.0500 time 0.5722 (0.5838) data time 0.0006 (0.0021) model time 0.5716 (0.5837) loss 7.5472 (7.2963) grad_norm 2.8584 (2.4838) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:56:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][300/625] eta 0:03:09 lr 0.000333 wd 0.0500 time 0.5716 (0.5835) data time 0.0008 (0.0021) model time 0.5708 (0.5833) loss 6.7293 (7.3051) grad_norm 3.2639 (2.4985) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:56:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][310/625] eta 0:03:03 lr 0.000333 wd 0.0500 time 0.5749 (0.5832) data time 0.0008 (0.0020) model time 0.5741 (0.5829) loss 5.9420 (7.2882) grad_norm 3.2170 (2.4927) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:56:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][320/625] eta 0:02:57 lr 0.000333 wd 0.0500 time 0.5749 (0.5829) data time 0.0006 (0.0020) model time 0.5743 (0.5825) loss 6.5019 (7.2832) grad_norm 2.3714 (2.4880) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:57:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][330/625] eta 0:02:51 lr 0.000333 wd 0.0500 time 0.5761 (0.5827) data time 0.0007 (0.0020) model time 0.5753 (0.5823) loss 7.7121 (7.2891) grad_norm 2.8463 (2.4980) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:57:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][340/625] eta 0:02:46 lr 0.000333 wd 0.0500 time 0.5728 (0.5825) data time 0.0006 (0.0019) model time 0.5722 (0.5820) loss 7.6583 (7.2978) grad_norm 2.4360 (2.5240) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:57:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][350/625] eta 0:02:40 lr 0.000333 wd 0.0500 time 0.5684 (0.5827) data time 0.0007 (0.0019) model time 0.5677 (0.5822) loss 6.3032 (7.3010) grad_norm 1.9456 (2.5190) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:57:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][360/625] eta 0:02:34 lr 0.000333 wd 0.0500 time 0.6198 (0.5827) data time 0.0007 (0.0019) model time 0.6192 (0.5822) loss 6.0807 (7.3039) grad_norm 2.1403 (2.5248) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:57:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][370/625] eta 0:02:28 lr 0.000333 wd 0.0500 time 0.7283 (0.5840) data time 0.0006 (0.0018) model time 0.7276 (0.5837) loss 8.2224 (7.3026) grad_norm 1.8922 (2.5732) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:57:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][380/625] eta 0:02:23 lr 0.000333 wd 0.0500 time 0.6809 (0.5857) data time 0.0008 (0.0018) model time 0.6801 (0.5857) loss 7.3187 (7.3061) grad_norm 2.1858 (2.5681) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:57:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][390/625] eta 0:02:17 lr 0.000333 wd 0.0500 time 0.5715 (0.5860) data time 0.0008 (0.0018) model time 0.5707 (0.5860) loss 6.6824 (7.3078) grad_norm 2.4759 (2.5600) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:57:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][400/625] eta 0:02:11 lr 0.000333 wd 0.0500 time 0.5700 (0.5865) data time 0.0008 (0.0018) model time 0.5692 (0.5865) loss 7.5279 (7.3127) grad_norm 1.6740 (2.5544) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:57:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][410/625] eta 0:02:06 lr 0.000332 wd 0.0500 time 0.5760 (0.5868) data time 0.0009 (0.0017) model time 0.5752 (0.5868) loss 7.2942 (7.3190) grad_norm 2.1657 (2.5499) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:57:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][420/625] eta 0:02:00 lr 0.000332 wd 0.0500 time 0.5733 (0.5865) data time 0.0008 (0.0017) model time 0.5725 (0.5864) loss 6.9919 (7.3195) grad_norm 1.9852 (2.5404) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:58:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][430/625] eta 0:01:54 lr 0.000332 wd 0.0500 time 0.5688 (0.5861) data time 0.0008 (0.0017) model time 0.5680 (0.5860) loss 6.6290 (7.3176) grad_norm 2.9414 (2.5273) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:58:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][440/625] eta 0:01:48 lr 0.000332 wd 0.0500 time 0.5750 (0.5859) data time 0.0006 (0.0017) model time 0.5743 (0.5857) loss 7.2775 (7.3161) grad_norm 4.5550 (2.5360) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:58:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][450/625] eta 0:01:42 lr 0.000332 wd 0.0500 time 0.5711 (0.5857) data time 0.0007 (0.0017) model time 0.5704 (0.5855) loss 7.6850 (7.3146) grad_norm 2.0334 (2.5616) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:58:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][460/625] eta 0:01:36 lr 0.000332 wd 0.0500 time 0.5702 (0.5855) data time 0.0008 (0.0017) model time 0.5693 (0.5852) loss 8.4274 (7.3179) grad_norm 2.1695 (2.5681) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:58:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][470/625] eta 0:01:30 lr 0.000332 wd 0.0500 time 0.5740 (0.5852) data time 0.0006 (0.0016) model time 0.5734 (0.5849) loss 6.3517 (7.3137) grad_norm 2.6103 (2.5712) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:58:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][480/625] eta 0:01:24 lr 0.000332 wd 0.0500 time 0.5756 (0.5850) data time 0.0008 (0.0016) model time 0.5748 (0.5847) loss 8.0846 (7.3187) grad_norm 2.6831 (2.5657) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:58:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][490/625] eta 0:01:18 lr 0.000332 wd 0.0500 time 0.5700 (0.5848) data time 0.0006 (0.0016) model time 0.5693 (0.5844) loss 7.8542 (7.3199) grad_norm 2.4535 (2.5795) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:58:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][500/625] eta 0:01:13 lr 0.000332 wd 0.0500 time 0.5730 (0.5846) data time 0.0006 (0.0016) model time 0.5724 (0.5842) loss 6.8430 (7.3149) grad_norm 2.5937 (2.5976) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:58:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][510/625] eta 0:01:07 lr 0.000331 wd 0.0500 time 0.5716 (0.5844) data time 0.0007 (0.0016) model time 0.5710 (0.5839) loss 5.5151 (7.3155) grad_norm 1.6452 (2.5922) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:58:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][520/625] eta 0:01:01 lr 0.000331 wd 0.0500 time 0.5706 (0.5842) data time 0.0007 (0.0016) model time 0.5699 (0.5837) loss 6.5126 (7.3149) grad_norm 1.9458 (2.6019) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:58:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][530/625] eta 0:00:55 lr 0.000331 wd 0.0500 time 0.5701 (0.5840) data time 0.0006 (0.0016) model time 0.5695 (0.5835) loss 8.2577 (7.3212) grad_norm 2.0383 (2.5917) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:59:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][540/625] eta 0:00:49 lr 0.000331 wd 0.0500 time 0.5691 (0.5838) data time 0.0006 (0.0015) model time 0.5686 (0.5832) loss 8.0046 (7.3287) grad_norm 1.8904 (2.5853) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:59:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][550/625] eta 0:00:43 lr 0.000331 wd 0.0500 time 0.5725 (0.5836) data time 0.0008 (0.0015) model time 0.5717 (0.5831) loss 8.8624 (7.3299) grad_norm 1.9181 (2.5751) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 10:59:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][560/625] eta 0:00:37 lr 0.000331 wd 0.0500 time 0.5724 (0.5835) data time 0.0008 (0.0015) model time 0.5716 (0.5829) loss 7.9147 (7.3302) grad_norm 2.5538 (2.5781) loss_scale 1024.0000 (516.5633) mem 22339MB +[2024-07-25 10:59:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][570/625] eta 0:00:32 lr 0.000331 wd 0.0500 time 0.5710 (0.5835) data time 0.0007 (0.0015) model time 0.5703 (0.5830) loss 7.3855 (7.3268) grad_norm 4.1698 (2.5882) loss_scale 1024.0000 (525.4501) mem 22339MB +[2024-07-25 10:59:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][580/625] eta 0:00:26 lr 0.000331 wd 0.0500 time 0.6992 (0.5839) data time 0.0006 (0.0015) model time 0.6986 (0.5833) loss 7.6361 (7.3284) grad_norm 2.7558 (2.5874) loss_scale 1024.0000 (534.0310) mem 22339MB +[2024-07-25 10:59:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][590/625] eta 0:00:20 lr 0.000331 wd 0.0500 time 0.5713 (0.5844) data time 0.0008 (0.0015) model time 0.5705 (0.5839) loss 6.5655 (7.3252) grad_norm 2.4158 (2.5866) loss_scale 1024.0000 (542.3215) mem 22339MB +[2024-07-25 10:59:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][600/625] eta 0:00:14 lr 0.000331 wd 0.0500 time 0.7427 (0.5854) data time 0.0006 (0.0015) model time 0.7421 (0.5850) loss 6.9851 (7.3297) grad_norm 2.9637 (2.5928) loss_scale 1024.0000 (550.3361) mem 22339MB +[2024-07-25 10:59:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][610/625] eta 0:00:08 lr 0.000331 wd 0.0500 time 0.5646 (0.5859) data time 0.0006 (0.0015) model time 0.5641 (0.5855) loss 7.9267 (7.3321) grad_norm 2.3035 (2.5936) loss_scale 1024.0000 (558.0884) mem 22339MB +[2024-07-25 10:59:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [202/300][620/625] eta 0:00:02 lr 0.000330 wd 0.0500 time 0.5947 (0.5860) data time 0.0005 (0.0015) model time 0.5941 (0.5857) loss 7.1937 (7.3338) grad_norm 2.1482 (2.5952) loss_scale 1024.0000 (565.5910) mem 22339MB +[2024-07-25 10:59:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 202 training takes 0:06:06 +[2024-07-25 10:59:53 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 10:59:55 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 10:59:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.481 (0.481) Loss 0.5142 (0.5142) Acc@1 90.625 (90.625) Acc@5 98.730 (98.730) Mem 22339MB +[2024-07-25 10:59:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.159) Loss 0.7729 (0.6307) Acc@1 82.471 (87.243) Acc@5 97.070 (97.994) Mem 22339MB +[2024-07-25 10:59:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.8662 (0.7259) Acc@1 79.004 (84.401) Acc@5 95.801 (97.070) Mem 22339MB +[2024-07-25 10:59:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.027 Acc@5 97.051 +[2024-07-25 10:59:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.0% +[2024-07-25 10:59:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 84.03% +[2024-07-25 10:59:58 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 11:00:00 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 11:00:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.477 (0.477) Loss 0.5034 (0.5034) Acc@1 90.332 (90.332) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 11:00:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7529 (0.6220) Acc@1 83.447 (87.460) Acc@5 96.631 (98.002) Mem 22339MB +[2024-07-25 11:00:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8687 (0.7139) Acc@1 79.102 (84.519) Acc@5 95.996 (97.135) Mem 22339MB +[2024-07-25 11:00:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.131 Acc@5 97.119 +[2024-07-25 11:00:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.1% +[2024-07-25 11:00:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.13% +[2024-07-25 11:00:04 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 11:00:05 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 11:00:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][0/625] eta 0:09:48 lr 0.000330 wd 0.0500 time 0.9413 (0.9413) data time 0.4213 (0.4213) model time 0.0000 (0.0000) loss 7.9004 (7.9004) grad_norm 2.9148 (2.9148) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:00:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][10/625] eta 0:06:22 lr 0.000330 wd 0.0500 time 0.5713 (0.6220) data time 0.0008 (0.0391) model time 0.0000 (0.0000) loss 7.9936 (7.4736) grad_norm 1.8010 (2.2187) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:00:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][20/625] eta 0:06:02 lr 0.000330 wd 0.0500 time 0.5704 (0.5991) data time 0.0008 (0.0209) model time 0.0000 (0.0000) loss 7.3788 (7.3754) grad_norm 2.0510 (2.2189) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:00:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][30/625] eta 0:05:51 lr 0.000330 wd 0.0500 time 0.5727 (0.5913) data time 0.0008 (0.0144) model time 0.0000 (0.0000) loss 7.3939 (7.4921) grad_norm 2.1043 (2.1780) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:00:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][40/625] eta 0:05:43 lr 0.000330 wd 0.0500 time 0.5734 (0.5870) data time 0.0007 (0.0111) model time 0.0000 (0.0000) loss 6.8335 (7.5433) grad_norm 1.4537 (2.1778) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:00:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][50/625] eta 0:05:36 lr 0.000330 wd 0.0500 time 0.5634 (0.5845) data time 0.0006 (0.0091) model time 0.0000 (0.0000) loss 7.4655 (7.4971) grad_norm 2.7852 (2.1746) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:00:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][60/625] eta 0:05:29 lr 0.000330 wd 0.0500 time 0.5629 (0.5829) data time 0.0008 (0.0078) model time 0.5621 (0.5739) loss 7.0279 (7.4813) grad_norm 3.2894 (2.2381) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:00:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][70/625] eta 0:05:22 lr 0.000330 wd 0.0500 time 0.5679 (0.5819) data time 0.0006 (0.0068) model time 0.5672 (0.5743) loss 6.3619 (7.4906) grad_norm 4.6093 (2.2665) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:00:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][80/625] eta 0:05:16 lr 0.000330 wd 0.0500 time 0.5667 (0.5811) data time 0.0007 (0.0060) model time 0.5660 (0.5745) loss 7.3751 (7.5103) grad_norm 3.6026 (2.2903) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:00:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][90/625] eta 0:05:10 lr 0.000330 wd 0.0500 time 0.5635 (0.5804) data time 0.0006 (0.0055) model time 0.5628 (0.5743) loss 7.6587 (7.4863) grad_norm 3.1865 (2.3059) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:01:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][100/625] eta 0:05:04 lr 0.000329 wd 0.0500 time 0.5643 (0.5798) data time 0.0006 (0.0050) model time 0.5636 (0.5741) loss 8.7548 (7.5027) grad_norm 2.2129 (2.2876) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:01:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][110/625] eta 0:04:58 lr 0.000329 wd 0.0500 time 0.5731 (0.5793) data time 0.0007 (0.0046) model time 0.5723 (0.5741) loss 7.3178 (7.5000) grad_norm 3.4061 (2.2762) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:01:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][120/625] eta 0:04:52 lr 0.000329 wd 0.0500 time 0.5646 (0.5790) data time 0.0006 (0.0043) model time 0.5640 (0.5742) loss 6.3705 (7.4639) grad_norm 2.2993 (2.2758) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:01:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][130/625] eta 0:04:46 lr 0.000329 wd 0.0500 time 0.5733 (0.5790) data time 0.0007 (0.0041) model time 0.5726 (0.5746) loss 7.6352 (7.4411) grad_norm 2.6678 (2.2935) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:01:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][140/625] eta 0:04:40 lr 0.000329 wd 0.0500 time 0.5702 (0.5786) data time 0.0006 (0.0038) model time 0.5696 (0.5745) loss 6.3749 (7.4297) grad_norm 6.0390 (2.3398) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:01:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][150/625] eta 0:04:34 lr 0.000329 wd 0.0500 time 0.5708 (0.5783) data time 0.0008 (0.0036) model time 0.5700 (0.5743) loss 8.0574 (7.4032) grad_norm 2.3137 (inf) loss_scale 512.0000 (1013.8278) mem 22339MB +[2024-07-25 11:01:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][160/625] eta 0:04:29 lr 0.000329 wd 0.0500 time 0.7081 (0.5790) data time 0.0007 (0.0035) model time 0.7074 (0.5756) loss 8.4957 (7.4172) grad_norm 4.3037 (inf) loss_scale 512.0000 (982.6584) mem 22339MB +[2024-07-25 11:01:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][170/625] eta 0:04:23 lr 0.000329 wd 0.0500 time 0.5729 (0.5797) data time 0.0006 (0.0033) model time 0.5723 (0.5768) loss 6.5379 (7.3875) grad_norm 2.1279 (inf) loss_scale 512.0000 (955.1345) mem 22339MB +[2024-07-25 11:01:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][180/625] eta 0:04:18 lr 0.000329 wd 0.0500 time 0.5667 (0.5817) data time 0.0008 (0.0032) model time 0.5659 (0.5798) loss 7.9010 (7.3844) grad_norm 1.6371 (inf) loss_scale 512.0000 (930.6519) mem 22339MB +[2024-07-25 11:01:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][190/625] eta 0:04:14 lr 0.000329 wd 0.0500 time 0.5699 (0.5845) data time 0.0006 (0.0031) model time 0.5693 (0.5837) loss 7.0931 (7.3742) grad_norm 4.0708 (inf) loss_scale 512.0000 (908.7330) mem 22339MB +[2024-07-25 11:02:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][200/625] eta 0:04:10 lr 0.000329 wd 0.0500 time 0.5648 (0.5888) data time 0.0007 (0.0030) model time 0.5641 (0.5894) loss 7.1275 (7.3863) grad_norm 3.1816 (inf) loss_scale 512.0000 (888.9950) mem 22339MB +[2024-07-25 11:02:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][210/625] eta 0:04:04 lr 0.000328 wd 0.0500 time 0.5696 (0.5882) data time 0.0006 (0.0029) model time 0.5690 (0.5885) loss 7.7111 (7.3955) grad_norm 2.7400 (inf) loss_scale 512.0000 (871.1280) mem 22339MB +[2024-07-25 11:02:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][220/625] eta 0:03:59 lr 0.000328 wd 0.0500 time 0.6325 (0.5905) data time 0.0008 (0.0028) model time 0.6317 (0.5914) loss 7.8359 (7.3775) grad_norm 1.7615 (inf) loss_scale 512.0000 (854.8778) mem 22339MB +[2024-07-25 11:02:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][230/625] eta 0:03:52 lr 0.000328 wd 0.0500 time 0.5619 (0.5898) data time 0.0006 (0.0027) model time 0.5613 (0.5905) loss 6.9007 (7.3807) grad_norm 2.1712 (inf) loss_scale 512.0000 (840.0346) mem 22339MB +[2024-07-25 11:02:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][240/625] eta 0:03:46 lr 0.000328 wd 0.0500 time 0.5734 (0.5892) data time 0.0008 (0.0026) model time 0.5725 (0.5896) loss 7.4680 (7.3910) grad_norm 1.6923 (inf) loss_scale 512.0000 (826.4232) mem 22339MB +[2024-07-25 11:02:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][250/625] eta 0:03:40 lr 0.000328 wd 0.0500 time 0.5713 (0.5887) data time 0.0006 (0.0025) model time 0.5707 (0.5888) loss 6.4949 (7.3699) grad_norm 3.1671 (inf) loss_scale 512.0000 (813.8964) mem 22339MB +[2024-07-25 11:02:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][260/625] eta 0:03:34 lr 0.000328 wd 0.0500 time 0.5723 (0.5881) data time 0.0008 (0.0025) model time 0.5716 (0.5881) loss 7.8094 (7.3563) grad_norm 1.6895 (inf) loss_scale 512.0000 (802.3295) mem 22339MB +[2024-07-25 11:02:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][270/625] eta 0:03:28 lr 0.000328 wd 0.0500 time 0.5713 (0.5876) data time 0.0006 (0.0024) model time 0.5707 (0.5874) loss 6.5860 (7.3540) grad_norm 2.4003 (inf) loss_scale 512.0000 (791.6162) mem 22339MB +[2024-07-25 11:02:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][280/625] eta 0:03:22 lr 0.000328 wd 0.0500 time 0.5726 (0.5872) data time 0.0008 (0.0024) model time 0.5718 (0.5869) loss 6.4613 (7.3532) grad_norm 2.9286 (inf) loss_scale 512.0000 (781.6655) mem 22339MB +[2024-07-25 11:02:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][290/625] eta 0:03:16 lr 0.000328 wd 0.0500 time 0.5752 (0.5868) data time 0.0006 (0.0023) model time 0.5746 (0.5864) loss 8.0988 (7.3619) grad_norm 2.6375 (inf) loss_scale 512.0000 (772.3986) mem 22339MB +[2024-07-25 11:03:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][300/625] eta 0:03:10 lr 0.000328 wd 0.0500 time 0.5732 (0.5864) data time 0.0006 (0.0023) model time 0.5726 (0.5859) loss 8.5651 (7.3547) grad_norm 1.8810 (inf) loss_scale 512.0000 (763.7475) mem 22339MB +[2024-07-25 11:03:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][310/625] eta 0:03:04 lr 0.000327 wd 0.0500 time 0.5664 (0.5860) data time 0.0006 (0.0022) model time 0.5658 (0.5854) loss 6.8208 (7.3320) grad_norm 1.8705 (inf) loss_scale 512.0000 (755.6527) mem 22339MB +[2024-07-25 11:03:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][320/625] eta 0:02:58 lr 0.000327 wd 0.0500 time 0.5738 (0.5857) data time 0.0008 (0.0022) model time 0.5729 (0.5851) loss 7.8862 (7.3438) grad_norm 1.7070 (inf) loss_scale 512.0000 (748.0623) mem 22339MB +[2024-07-25 11:03:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][330/625] eta 0:02:52 lr 0.000327 wd 0.0500 time 0.5738 (0.5853) data time 0.0006 (0.0021) model time 0.5732 (0.5846) loss 6.6896 (7.3488) grad_norm 2.8445 (inf) loss_scale 512.0000 (740.9305) mem 22339MB +[2024-07-25 11:03:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][340/625] eta 0:02:46 lr 0.000327 wd 0.0500 time 0.5744 (0.5850) data time 0.0006 (0.0021) model time 0.5738 (0.5843) loss 7.7074 (7.3693) grad_norm 6.0274 (inf) loss_scale 512.0000 (734.2170) mem 22339MB +[2024-07-25 11:03:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][350/625] eta 0:02:40 lr 0.000327 wd 0.0500 time 0.5724 (0.5847) data time 0.0008 (0.0021) model time 0.5716 (0.5839) loss 7.2699 (7.3675) grad_norm 3.3209 (inf) loss_scale 512.0000 (727.8860) mem 22339MB +[2024-07-25 11:03:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][360/625] eta 0:02:34 lr 0.000327 wd 0.0500 time 0.5706 (0.5845) data time 0.0006 (0.0020) model time 0.5699 (0.5836) loss 6.0970 (7.3594) grad_norm 2.0093 (inf) loss_scale 512.0000 (721.9058) mem 22339MB +[2024-07-25 11:03:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][370/625] eta 0:02:28 lr 0.000327 wd 0.0500 time 0.5719 (0.5842) data time 0.0006 (0.0020) model time 0.5713 (0.5833) loss 8.3416 (7.3552) grad_norm 4.2932 (inf) loss_scale 512.0000 (716.2480) mem 22339MB +[2024-07-25 11:03:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][380/625] eta 0:02:23 lr 0.000327 wd 0.0500 time 0.5716 (0.5839) data time 0.0006 (0.0020) model time 0.5710 (0.5830) loss 8.2617 (7.3522) grad_norm 2.0946 (inf) loss_scale 512.0000 (710.8871) mem 22339MB +[2024-07-25 11:03:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][390/625] eta 0:02:17 lr 0.000327 wd 0.0500 time 0.5699 (0.5840) data time 0.0008 (0.0019) model time 0.5691 (0.5831) loss 7.6065 (7.3559) grad_norm 2.6548 (inf) loss_scale 512.0000 (705.8005) mem 22339MB +[2024-07-25 11:03:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][400/625] eta 0:02:11 lr 0.000327 wd 0.0500 time 0.5668 (0.5845) data time 0.0008 (0.0019) model time 0.5660 (0.5836) loss 6.8269 (7.3586) grad_norm 2.0249 (inf) loss_scale 512.0000 (700.9676) mem 22339MB +[2024-07-25 11:04:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][410/625] eta 0:02:06 lr 0.000327 wd 0.0500 time 0.5730 (0.5861) data time 0.0006 (0.0019) model time 0.5723 (0.5855) loss 8.3459 (7.3529) grad_norm 2.7450 (inf) loss_scale 512.0000 (696.3698) mem 22339MB +[2024-07-25 11:04:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][420/625] eta 0:02:00 lr 0.000326 wd 0.0500 time 0.5661 (0.5882) data time 0.0006 (0.0019) model time 0.5654 (0.5879) loss 6.9257 (7.3606) grad_norm 1.8810 (inf) loss_scale 512.0000 (691.9905) mem 22339MB +[2024-07-25 11:04:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][430/625] eta 0:01:54 lr 0.000326 wd 0.0500 time 0.5722 (0.5879) data time 0.0008 (0.0018) model time 0.5714 (0.5875) loss 7.7062 (7.3651) grad_norm 1.8805 (inf) loss_scale 512.0000 (687.8144) mem 22339MB +[2024-07-25 11:04:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][440/625] eta 0:01:48 lr 0.000326 wd 0.0500 time 0.6624 (0.5883) data time 0.0008 (0.0018) model time 0.6616 (0.5879) loss 7.8109 (7.3544) grad_norm 2.9660 (inf) loss_scale 512.0000 (683.8277) mem 22339MB +[2024-07-25 11:04:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][450/625] eta 0:01:42 lr 0.000326 wd 0.0500 time 0.5753 (0.5880) data time 0.0006 (0.0018) model time 0.5747 (0.5875) loss 6.9941 (7.3464) grad_norm 1.6992 (inf) loss_scale 512.0000 (680.0177) mem 22339MB +[2024-07-25 11:04:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][460/625] eta 0:01:36 lr 0.000326 wd 0.0500 time 0.5701 (0.5877) data time 0.0008 (0.0018) model time 0.5693 (0.5872) loss 5.9552 (7.3402) grad_norm 3.2464 (inf) loss_scale 512.0000 (676.3731) mem 22339MB +[2024-07-25 11:04:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][470/625] eta 0:01:31 lr 0.000326 wd 0.0500 time 0.5724 (0.5874) data time 0.0006 (0.0018) model time 0.5718 (0.5869) loss 8.2706 (7.3375) grad_norm 2.1280 (inf) loss_scale 512.0000 (672.8832) mem 22339MB +[2024-07-25 11:04:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][480/625] eta 0:01:25 lr 0.000326 wd 0.0500 time 0.5717 (0.5871) data time 0.0007 (0.0017) model time 0.5710 (0.5865) loss 5.9178 (7.3342) grad_norm 4.4008 (inf) loss_scale 512.0000 (669.5385) mem 22339MB +[2024-07-25 11:04:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][490/625] eta 0:01:19 lr 0.000326 wd 0.0500 time 0.5764 (0.5868) data time 0.0007 (0.0017) model time 0.5757 (0.5862) loss 6.5663 (7.3250) grad_norm 4.4927 (inf) loss_scale 512.0000 (666.3299) mem 22339MB +[2024-07-25 11:04:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][500/625] eta 0:01:13 lr 0.000326 wd 0.0500 time 0.5670 (0.5866) data time 0.0009 (0.0017) model time 0.5661 (0.5859) loss 7.9210 (7.3310) grad_norm 2.5945 (inf) loss_scale 512.0000 (663.2495) mem 22339MB +[2024-07-25 11:05:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][510/625] eta 0:01:07 lr 0.000326 wd 0.0500 time 0.5735 (0.5863) data time 0.0007 (0.0017) model time 0.5728 (0.5857) loss 6.9971 (7.3332) grad_norm 1.8899 (inf) loss_scale 512.0000 (660.2896) mem 22339MB +[2024-07-25 11:05:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][520/625] eta 0:01:01 lr 0.000326 wd 0.0500 time 0.5736 (0.5861) data time 0.0006 (0.0017) model time 0.5730 (0.5854) loss 7.4118 (7.3291) grad_norm 2.6538 (inf) loss_scale 512.0000 (657.4434) mem 22339MB +[2024-07-25 11:05:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][530/625] eta 0:00:55 lr 0.000325 wd 0.0500 time 0.5710 (0.5859) data time 0.0008 (0.0017) model time 0.5702 (0.5852) loss 7.0876 (7.3336) grad_norm 2.1115 (inf) loss_scale 512.0000 (654.7043) mem 22339MB +[2024-07-25 11:05:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][540/625] eta 0:00:49 lr 0.000325 wd 0.0500 time 0.5714 (0.5857) data time 0.0006 (0.0017) model time 0.5708 (0.5850) loss 7.4246 (7.3319) grad_norm 2.9520 (inf) loss_scale 512.0000 (652.0665) mem 22339MB +[2024-07-25 11:05:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][550/625] eta 0:00:43 lr 0.000325 wd 0.0500 time 0.5726 (0.5855) data time 0.0007 (0.0016) model time 0.5718 (0.5847) loss 6.9715 (7.3324) grad_norm 1.9131 (inf) loss_scale 256.0000 (645.3430) mem 22339MB +[2024-07-25 11:05:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][560/625] eta 0:00:38 lr 0.000325 wd 0.0500 time 0.5683 (0.5853) data time 0.0008 (0.0016) model time 0.5675 (0.5845) loss 7.5126 (7.3300) grad_norm 2.1760 (inf) loss_scale 256.0000 (638.4029) mem 22339MB +[2024-07-25 11:05:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][570/625] eta 0:00:32 lr 0.000325 wd 0.0500 time 0.5729 (0.5851) data time 0.0008 (0.0016) model time 0.5722 (0.5843) loss 6.6501 (7.3252) grad_norm 2.3158 (inf) loss_scale 256.0000 (631.7058) mem 22339MB +[2024-07-25 11:05:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][580/625] eta 0:00:26 lr 0.000325 wd 0.0500 time 0.5698 (0.5849) data time 0.0009 (0.0016) model time 0.5690 (0.5841) loss 7.0458 (7.3186) grad_norm 2.4438 (inf) loss_scale 256.0000 (625.2392) mem 22339MB +[2024-07-25 11:05:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][590/625] eta 0:00:20 lr 0.000325 wd 0.0500 time 0.5736 (0.5847) data time 0.0009 (0.0016) model time 0.5727 (0.5839) loss 8.1548 (7.3152) grad_norm 5.9508 (inf) loss_scale 256.0000 (618.9915) mem 22339MB +[2024-07-25 11:05:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][600/625] eta 0:00:14 lr 0.000325 wd 0.0500 time 0.5740 (0.5846) data time 0.0006 (0.0016) model time 0.5734 (0.5837) loss 5.9791 (7.3116) grad_norm 3.4442 (inf) loss_scale 256.0000 (612.9517) mem 22339MB +[2024-07-25 11:06:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][610/625] eta 0:00:08 lr 0.000325 wd 0.0500 time 0.5726 (0.5848) data time 0.0006 (0.0016) model time 0.5720 (0.5839) loss 8.0616 (7.3079) grad_norm 3.2070 (inf) loss_scale 256.0000 (607.1097) mem 22339MB +[2024-07-25 11:06:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [203/300][620/625] eta 0:00:02 lr 0.000325 wd 0.0500 time 0.5679 (0.5849) data time 0.0005 (0.0016) model time 0.5674 (0.5840) loss 8.7646 (7.3150) grad_norm 2.1527 (inf) loss_scale 256.0000 (601.4557) mem 22339MB +[2024-07-25 11:06:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 203 training takes 0:06:05 +[2024-07-25 11:06:11 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 11:06:12 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 11:06:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.975 (0.975) Loss 0.5093 (0.5093) Acc@1 90.430 (90.430) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 11:06:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.203) Loss 0.7642 (0.6319) Acc@1 82.422 (87.194) Acc@5 97.021 (97.892) Mem 22339MB +[2024-07-25 11:06:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.166) Loss 0.8740 (0.7237) Acc@1 78.125 (84.345) Acc@5 96.045 (96.998) Mem 22339MB +[2024-07-25 11:06:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 83.993 Acc@5 97.003 +[2024-07-25 11:06:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.0% +[2024-07-25 11:06:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 1.206 (1.206) Loss 0.5034 (0.5034) Acc@1 90.381 (90.381) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 11:06:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.224) Loss 0.7524 (0.6219) Acc@1 83.447 (87.447) Acc@5 96.631 (97.998) Mem 22339MB +[2024-07-25 11:06:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.177) Loss 0.8682 (0.7136) Acc@1 79.102 (84.498) Acc@5 95.996 (97.140) Mem 22339MB +[2024-07-25 11:06:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.115 Acc@5 97.123 +[2024-07-25 11:06:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.1% +[2024-07-25 11:06:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][0/625] eta 0:17:53 lr 0.000325 wd 0.0500 time 1.7168 (1.7168) data time 0.7998 (0.7998) model time 0.0000 (0.0000) loss 6.8559 (6.8559) grad_norm 3.0515 (3.0515) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:06:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][10/625] eta 0:07:40 lr 0.000324 wd 0.0500 time 0.7650 (0.7490) data time 0.0008 (0.0735) model time 0.0000 (0.0000) loss 6.6630 (7.0957) grad_norm 2.9120 (2.8221) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:06:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][20/625] eta 0:06:52 lr 0.000324 wd 0.0500 time 0.5773 (0.6825) data time 0.0008 (0.0389) model time 0.0000 (0.0000) loss 7.7775 (7.3586) grad_norm 3.2766 (2.7186) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:06:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][30/625] eta 0:06:28 lr 0.000324 wd 0.0500 time 0.5718 (0.6536) data time 0.0009 (0.0266) model time 0.0000 (0.0000) loss 5.9324 (7.3272) grad_norm 2.2781 (2.7015) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:06:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][40/625] eta 0:06:13 lr 0.000324 wd 0.0500 time 0.5736 (0.6384) data time 0.0006 (0.0203) model time 0.0000 (0.0000) loss 8.4657 (7.2721) grad_norm 3.2252 (2.8517) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:06:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][50/625] eta 0:05:59 lr 0.000324 wd 0.0500 time 0.5751 (0.6257) data time 0.0006 (0.0165) model time 0.0000 (0.0000) loss 7.2406 (7.2220) grad_norm 2.8975 (2.9228) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:06:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][60/625] eta 0:05:48 lr 0.000324 wd 0.0500 time 0.5736 (0.6172) data time 0.0006 (0.0140) model time 0.5730 (0.5728) loss 7.5937 (7.2519) grad_norm 1.8241 (2.8433) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:07:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][70/625] eta 0:05:39 lr 0.000324 wd 0.0500 time 0.5801 (0.6114) data time 0.0008 (0.0121) model time 0.5793 (0.5740) loss 5.9987 (7.2299) grad_norm 2.4999 (2.7733) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:07:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][80/625] eta 0:05:30 lr 0.000324 wd 0.0500 time 0.5751 (0.6067) data time 0.0007 (0.0108) model time 0.5744 (0.5734) loss 5.8653 (7.1664) grad_norm 1.5074 (2.6971) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:07:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][90/625] eta 0:05:22 lr 0.000324 wd 0.0500 time 0.5783 (0.6031) data time 0.0009 (0.0097) model time 0.5773 (0.5734) loss 7.4811 (7.1315) grad_norm 1.5966 (2.6497) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:07:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][100/625] eta 0:05:15 lr 0.000324 wd 0.0500 time 0.5738 (0.6003) data time 0.0007 (0.0088) model time 0.5732 (0.5734) loss 6.1829 (7.1124) grad_norm 1.5403 (2.5942) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:07:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][110/625] eta 0:05:08 lr 0.000323 wd 0.0500 time 0.5739 (0.5981) data time 0.0008 (0.0081) model time 0.5731 (0.5737) loss 7.4982 (7.1409) grad_norm 3.0443 (2.5458) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:07:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][120/625] eta 0:05:01 lr 0.000323 wd 0.0500 time 0.5775 (0.5963) data time 0.0006 (0.0075) model time 0.5769 (0.5739) loss 5.9610 (7.1167) grad_norm 3.0704 (2.5863) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:07:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][130/625] eta 0:04:54 lr 0.000323 wd 0.0500 time 0.5780 (0.5946) data time 0.0008 (0.0070) model time 0.5772 (0.5738) loss 8.2307 (7.1491) grad_norm 1.8655 (2.5655) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:07:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][140/625] eta 0:04:47 lr 0.000323 wd 0.0500 time 0.5785 (0.5933) data time 0.0007 (0.0065) model time 0.5778 (0.5741) loss 7.7358 (7.1563) grad_norm 1.7655 (2.5234) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:07:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][150/625] eta 0:04:41 lr 0.000323 wd 0.0500 time 0.5684 (0.5921) data time 0.0009 (0.0062) model time 0.5675 (0.5741) loss 7.2662 (7.1666) grad_norm 1.6197 (2.4975) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:07:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][160/625] eta 0:04:34 lr 0.000323 wd 0.0500 time 0.5747 (0.5910) data time 0.0006 (0.0058) model time 0.5742 (0.5741) loss 8.2970 (7.1844) grad_norm 3.1225 (2.4714) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:08:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][170/625] eta 0:04:28 lr 0.000323 wd 0.0500 time 0.5759 (0.5901) data time 0.0006 (0.0055) model time 0.5753 (0.5740) loss 5.8538 (7.1704) grad_norm 1.6941 (2.4430) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:08:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][180/625] eta 0:04:22 lr 0.000323 wd 0.0500 time 0.5753 (0.5892) data time 0.0008 (0.0053) model time 0.5745 (0.5741) loss 7.0657 (7.1709) grad_norm 1.7856 (2.4292) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:08:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][190/625] eta 0:04:16 lr 0.000323 wd 0.0500 time 0.5773 (0.5893) data time 0.0008 (0.0050) model time 0.5765 (0.5752) loss 7.3558 (7.1906) grad_norm 2.4778 (2.4205) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:08:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][200/625] eta 0:04:10 lr 0.000323 wd 0.0500 time 0.5753 (0.5900) data time 0.0006 (0.0048) model time 0.5747 (0.5770) loss 7.5798 (7.2127) grad_norm 2.2026 (2.4207) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:08:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][210/625] eta 0:04:04 lr 0.000323 wd 0.0500 time 0.5728 (0.5902) data time 0.0008 (0.0046) model time 0.5719 (0.5780) loss 8.1875 (7.2203) grad_norm 1.4926 (2.4269) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:08:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][220/625] eta 0:03:59 lr 0.000322 wd 0.0500 time 0.5727 (0.5910) data time 0.0006 (0.0045) model time 0.5721 (0.5798) loss 6.1864 (7.2251) grad_norm 1.6432 (2.4255) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:08:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][230/625] eta 0:03:54 lr 0.000322 wd 0.0500 time 0.7241 (0.5940) data time 0.0008 (0.0043) model time 0.7233 (0.5841) loss 8.3551 (7.2311) grad_norm 1.8002 (2.4136) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:08:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][240/625] eta 0:03:49 lr 0.000322 wd 0.0500 time 0.5773 (0.5956) data time 0.0007 (0.0042) model time 0.5766 (0.5866) loss 6.0156 (7.2340) grad_norm 1.9715 (2.4010) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:08:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][250/625] eta 0:03:43 lr 0.000322 wd 0.0500 time 0.5728 (0.5948) data time 0.0008 (0.0040) model time 0.5720 (0.5860) loss 8.3138 (7.2347) grad_norm 1.8573 (2.4047) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:08:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][260/625] eta 0:03:37 lr 0.000322 wd 0.0500 time 0.5744 (0.5953) data time 0.0006 (0.0039) model time 0.5738 (0.5871) loss 7.7495 (7.2240) grad_norm 3.0332 (2.3957) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:09:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][270/625] eta 0:03:31 lr 0.000322 wd 0.0500 time 0.5728 (0.5945) data time 0.0009 (0.0038) model time 0.5719 (0.5864) loss 6.0675 (7.2097) grad_norm 2.4806 (2.3792) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:09:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][280/625] eta 0:03:24 lr 0.000322 wd 0.0500 time 0.5723 (0.5938) data time 0.0006 (0.0037) model time 0.5716 (0.5858) loss 7.9259 (7.2141) grad_norm 5.8777 (2.3852) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:09:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][290/625] eta 0:03:18 lr 0.000322 wd 0.0500 time 0.5767 (0.5931) data time 0.0006 (0.0036) model time 0.5761 (0.5853) loss 6.5122 (7.2033) grad_norm 2.1914 (2.3930) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:09:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][300/625] eta 0:03:12 lr 0.000322 wd 0.0500 time 0.5746 (0.5925) data time 0.0008 (0.0035) model time 0.5738 (0.5848) loss 7.0843 (7.2064) grad_norm 3.3079 (2.4002) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:09:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][310/625] eta 0:03:06 lr 0.000322 wd 0.0500 time 0.5755 (0.5919) data time 0.0006 (0.0034) model time 0.5748 (0.5844) loss 6.9831 (7.2135) grad_norm 3.0656 (2.4122) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:09:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][320/625] eta 0:03:00 lr 0.000322 wd 0.0500 time 0.5721 (0.5913) data time 0.0007 (0.0034) model time 0.5713 (0.5839) loss 7.9942 (7.2085) grad_norm 1.9588 (2.4055) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:09:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][330/625] eta 0:02:54 lr 0.000321 wd 0.0500 time 0.5733 (0.5908) data time 0.0007 (0.0033) model time 0.5725 (0.5835) loss 7.1475 (7.2100) grad_norm 2.7003 (2.4172) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:09:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][340/625] eta 0:02:48 lr 0.000321 wd 0.0500 time 0.5717 (0.5903) data time 0.0007 (0.0032) model time 0.5710 (0.5832) loss 7.9230 (7.2210) grad_norm 2.6319 (2.4286) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:09:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][350/625] eta 0:02:42 lr 0.000321 wd 0.0500 time 0.5773 (0.5898) data time 0.0009 (0.0031) model time 0.5764 (0.5828) loss 5.6341 (7.2127) grad_norm 2.4227 (2.4280) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:09:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][360/625] eta 0:02:36 lr 0.000321 wd 0.0500 time 0.5737 (0.5894) data time 0.0006 (0.0031) model time 0.5732 (0.5825) loss 6.2226 (7.2182) grad_norm 5.8175 (2.4496) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:09:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][370/625] eta 0:02:30 lr 0.000321 wd 0.0500 time 0.5755 (0.5889) data time 0.0006 (0.0030) model time 0.5749 (0.5822) loss 7.6041 (7.2261) grad_norm 3.2853 (2.4573) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:10:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][380/625] eta 0:02:24 lr 0.000321 wd 0.0500 time 0.5742 (0.5885) data time 0.0006 (0.0030) model time 0.5736 (0.5819) loss 6.4597 (7.2368) grad_norm 2.1529 (2.4615) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:10:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][390/625] eta 0:02:18 lr 0.000321 wd 0.0500 time 0.5779 (0.5882) data time 0.0008 (0.0029) model time 0.5771 (0.5817) loss 6.5082 (7.2387) grad_norm 1.7547 (2.4613) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:10:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][400/625] eta 0:02:12 lr 0.000321 wd 0.0500 time 0.5737 (0.5878) data time 0.0006 (0.0029) model time 0.5731 (0.5814) loss 6.6341 (7.2322) grad_norm 3.0427 (2.4598) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:10:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][410/625] eta 0:02:06 lr 0.000321 wd 0.0500 time 0.5799 (0.5875) data time 0.0008 (0.0028) model time 0.5791 (0.5812) loss 7.5345 (7.2299) grad_norm 2.4409 (2.4583) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:10:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][420/625] eta 0:02:00 lr 0.000321 wd 0.0500 time 0.5787 (0.5879) data time 0.0006 (0.0028) model time 0.5781 (0.5819) loss 6.9166 (7.2337) grad_norm 1.5018 (2.4541) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:10:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][430/625] eta 0:01:54 lr 0.000320 wd 0.0500 time 0.5760 (0.5879) data time 0.0006 (0.0027) model time 0.5754 (0.5820) loss 5.9728 (7.2270) grad_norm 2.2401 (2.4607) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:10:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][440/625] eta 0:01:48 lr 0.000320 wd 0.0500 time 0.7647 (0.5885) data time 0.0008 (0.0027) model time 0.7639 (0.5827) loss 8.5238 (7.2372) grad_norm 1.7844 (2.4677) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:10:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][450/625] eta 0:01:43 lr 0.000320 wd 0.0500 time 0.7528 (0.5896) data time 0.0006 (0.0026) model time 0.7521 (0.5841) loss 7.6777 (7.2426) grad_norm 2.0202 (2.4631) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:10:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][460/625] eta 0:01:37 lr 0.000320 wd 0.0500 time 0.5734 (0.5901) data time 0.0008 (0.0026) model time 0.5727 (0.5848) loss 7.1948 (7.2461) grad_norm 1.7517 (2.4569) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:10:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][470/625] eta 0:01:31 lr 0.000320 wd 0.0500 time 0.7464 (0.5902) data time 0.0008 (0.0026) model time 0.7457 (0.5850) loss 7.1588 (7.2449) grad_norm 2.7359 (2.4563) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:11:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][480/625] eta 0:01:25 lr 0.000320 wd 0.0500 time 0.5753 (0.5905) data time 0.0008 (0.0025) model time 0.5745 (0.5854) loss 7.9323 (7.2452) grad_norm 1.8218 (2.4507) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:11:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][490/625] eta 0:01:19 lr 0.000320 wd 0.0500 time 0.5779 (0.5902) data time 0.0005 (0.0025) model time 0.5774 (0.5852) loss 5.8629 (7.2480) grad_norm 1.9611 (2.4440) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:11:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][500/625] eta 0:01:13 lr 0.000320 wd 0.0500 time 0.5775 (0.5898) data time 0.0008 (0.0025) model time 0.5767 (0.5849) loss 8.2912 (7.2552) grad_norm 2.9396 (2.4374) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:11:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][510/625] eta 0:01:07 lr 0.000320 wd 0.0500 time 0.5752 (0.5895) data time 0.0006 (0.0024) model time 0.5746 (0.5846) loss 5.7097 (7.2596) grad_norm 1.8946 (2.4297) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:11:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][520/625] eta 0:01:01 lr 0.000320 wd 0.0500 time 0.5833 (0.5892) data time 0.0006 (0.0024) model time 0.5827 (0.5844) loss 7.9431 (7.2659) grad_norm 2.1301 (2.4419) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:11:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][530/625] eta 0:00:55 lr 0.000320 wd 0.0500 time 0.5759 (0.5890) data time 0.0008 (0.0024) model time 0.5751 (0.5842) loss 5.8656 (7.2645) grad_norm 2.4162 (2.4419) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:11:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][540/625] eta 0:00:50 lr 0.000319 wd 0.0500 time 0.5760 (0.5887) data time 0.0009 (0.0023) model time 0.5752 (0.5840) loss 5.8798 (7.2604) grad_norm 2.3286 (2.4394) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:11:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][550/625] eta 0:00:44 lr 0.000319 wd 0.0500 time 0.5767 (0.5884) data time 0.0008 (0.0023) model time 0.5759 (0.5837) loss 5.8957 (7.2580) grad_norm 6.9755 (2.4472) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:11:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][560/625] eta 0:00:38 lr 0.000319 wd 0.0500 time 0.5771 (0.5882) data time 0.0007 (0.0023) model time 0.5765 (0.5835) loss 8.3311 (7.2612) grad_norm 2.1225 (2.4503) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:11:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][570/625] eta 0:00:32 lr 0.000319 wd 0.0500 time 0.5746 (0.5879) data time 0.0008 (0.0023) model time 0.5738 (0.5833) loss 5.7954 (7.2547) grad_norm 2.7947 (2.4473) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:12:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][580/625] eta 0:00:26 lr 0.000319 wd 0.0500 time 0.5879 (0.5877) data time 0.0008 (0.0023) model time 0.5871 (0.5832) loss 8.9052 (7.2598) grad_norm 1.9988 (2.4512) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:12:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][590/625] eta 0:00:20 lr 0.000319 wd 0.0500 time 0.5735 (0.5875) data time 0.0008 (0.0022) model time 0.5727 (0.5830) loss 6.0847 (7.2626) grad_norm 2.2357 (2.4525) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:12:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][600/625] eta 0:00:14 lr 0.000319 wd 0.0500 time 0.5757 (0.5873) data time 0.0009 (0.0022) model time 0.5748 (0.5828) loss 7.5888 (7.2679) grad_norm 2.5030 (2.4521) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:12:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][610/625] eta 0:00:08 lr 0.000319 wd 0.0500 time 0.5745 (0.5871) data time 0.0004 (0.0022) model time 0.5742 (0.5827) loss 6.0129 (7.2688) grad_norm 2.5127 (2.4591) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:12:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [204/300][620/625] eta 0:00:02 lr 0.000319 wd 0.0500 time 0.5847 (0.5869) data time 0.0005 (0.0022) model time 0.5841 (0.5825) loss 7.8362 (7.2752) grad_norm 2.6499 (2.4567) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:12:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 204 training takes 0:06:06 +[2024-07-25 11:12:27 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 11:12:29 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 11:12:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.484 (0.484) Loss 0.5054 (0.5054) Acc@1 90.039 (90.039) Acc@5 98.828 (98.828) Mem 22339MB +[2024-07-25 11:12:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.7563 (0.6227) Acc@1 82.227 (87.349) Acc@5 96.631 (97.985) Mem 22339MB +[2024-07-25 11:12:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.8877 (0.7203) Acc@1 78.516 (84.435) Acc@5 95.508 (97.026) Mem 22339MB +[2024-07-25 11:12:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.123 Acc@5 97.007 +[2024-07-25 11:12:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.1% +[2024-07-25 11:12:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 84.12% +[2024-07-25 11:12:32 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 11:12:34 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 11:12:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.791 (0.791) Loss 0.5029 (0.5029) Acc@1 90.381 (90.381) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 11:12:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.186) Loss 0.7524 (0.6217) Acc@1 83.350 (87.456) Acc@5 96.631 (97.989) Mem 22339MB +[2024-07-25 11:12:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.157) Loss 0.8677 (0.7132) Acc@1 79.102 (84.521) Acc@5 96.045 (97.133) Mem 22339MB +[2024-07-25 11:12:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.145 Acc@5 97.117 +[2024-07-25 11:12:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.1% +[2024-07-25 11:12:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.15% +[2024-07-25 11:12:37 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 11:12:39 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 11:12:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][0/625] eta 0:10:23 lr 0.000319 wd 0.0500 time 0.9977 (0.9977) data time 0.4796 (0.4796) model time 0.0000 (0.0000) loss 7.1083 (7.1083) grad_norm 2.8771 (2.8771) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:12:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][10/625] eta 0:06:19 lr 0.000319 wd 0.0500 time 0.6499 (0.6178) data time 0.0006 (0.0444) model time 0.0000 (0.0000) loss 6.0677 (6.9200) grad_norm 2.2961 (2.3740) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:12:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][20/625] eta 0:06:02 lr 0.000318 wd 0.0500 time 0.5715 (0.5989) data time 0.0008 (0.0237) model time 0.0000 (0.0000) loss 7.8408 (7.0288) grad_norm 1.8830 (2.2276) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:12:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][30/625] eta 0:05:55 lr 0.000318 wd 0.0500 time 0.5698 (0.5982) data time 0.0007 (0.0163) model time 0.0000 (0.0000) loss 6.8167 (7.0254) grad_norm 3.4108 (2.2347) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:13:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][40/625] eta 0:05:53 lr 0.000318 wd 0.0500 time 0.7237 (0.6036) data time 0.0006 (0.0126) model time 0.0000 (0.0000) loss 8.3382 (7.1319) grad_norm 2.4721 (2.2545) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:13:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][50/625] eta 0:05:53 lr 0.000318 wd 0.0500 time 0.7519 (0.6149) data time 0.0007 (0.0104) model time 0.0000 (0.0000) loss 8.4285 (7.1212) grad_norm 1.9532 (2.2367) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:13:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][60/625] eta 0:05:45 lr 0.000318 wd 0.0500 time 0.5684 (0.6107) data time 0.0007 (0.0090) model time 0.5677 (0.5880) loss 7.8165 (7.1377) grad_norm 2.8425 (2.2601) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:13:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][70/625] eta 0:05:38 lr 0.000318 wd 0.0500 time 0.5660 (0.6101) data time 0.0007 (0.0078) model time 0.5653 (0.5966) loss 6.2849 (7.1441) grad_norm 1.9359 (2.2267) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:13:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][80/625] eta 0:05:30 lr 0.000318 wd 0.0500 time 0.5718 (0.6060) data time 0.0006 (0.0070) model time 0.5712 (0.5897) loss 8.1713 (7.1847) grad_norm 2.1576 (2.1970) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:13:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][90/625] eta 0:05:22 lr 0.000318 wd 0.0500 time 0.5720 (0.6024) data time 0.0008 (0.0063) model time 0.5713 (0.5855) loss 6.2975 (7.1876) grad_norm 1.6612 (2.1794) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:13:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][100/625] eta 0:05:14 lr 0.000318 wd 0.0500 time 0.5695 (0.5996) data time 0.0008 (0.0058) model time 0.5687 (0.5830) loss 7.9236 (7.2220) grad_norm 2.2234 (2.1600) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:13:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][110/625] eta 0:05:07 lr 0.000318 wd 0.0500 time 0.5701 (0.5973) data time 0.0007 (0.0053) model time 0.5694 (0.5814) loss 7.5010 (7.2363) grad_norm 2.1663 (2.1445) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:13:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][120/625] eta 0:05:00 lr 0.000318 wd 0.0500 time 0.5714 (0.5955) data time 0.0008 (0.0049) model time 0.5706 (0.5805) loss 8.8634 (7.2508) grad_norm 1.7964 (2.1405) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:13:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][130/625] eta 0:04:54 lr 0.000317 wd 0.0500 time 0.5731 (0.5940) data time 0.0006 (0.0046) model time 0.5725 (0.5796) loss 6.9826 (7.2566) grad_norm 1.8847 (2.1432) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:14:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][140/625] eta 0:04:47 lr 0.000317 wd 0.0500 time 0.5724 (0.5936) data time 0.0007 (0.0044) model time 0.5717 (0.5806) loss 7.9523 (7.2907) grad_norm 2.5868 (2.1992) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:14:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][150/625] eta 0:04:41 lr 0.000317 wd 0.0500 time 0.5729 (0.5924) data time 0.0008 (0.0041) model time 0.5721 (0.5801) loss 7.4560 (7.2894) grad_norm 2.3158 (2.2055) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:14:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][160/625] eta 0:04:35 lr 0.000317 wd 0.0500 time 0.5727 (0.5914) data time 0.0007 (0.0039) model time 0.5720 (0.5796) loss 6.2784 (7.2852) grad_norm 2.5437 (2.2264) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:14:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][170/625] eta 0:04:28 lr 0.000317 wd 0.0500 time 0.5729 (0.5905) data time 0.0008 (0.0038) model time 0.5721 (0.5792) loss 7.0846 (7.2699) grad_norm 2.0136 (2.2252) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:14:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][180/625] eta 0:04:22 lr 0.000317 wd 0.0500 time 0.5714 (0.5896) data time 0.0006 (0.0036) model time 0.5708 (0.5787) loss 8.0476 (7.2721) grad_norm 2.5451 (2.2111) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:14:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][190/625] eta 0:04:16 lr 0.000317 wd 0.0500 time 0.5718 (0.5888) data time 0.0008 (0.0034) model time 0.5710 (0.5783) loss 8.1915 (7.2831) grad_norm 1.9610 (2.2076) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:14:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][200/625] eta 0:04:09 lr 0.000317 wd 0.0500 time 0.5729 (0.5881) data time 0.0006 (0.0033) model time 0.5723 (0.5781) loss 6.3010 (7.2787) grad_norm 1.8668 (2.1944) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:14:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][210/625] eta 0:04:03 lr 0.000317 wd 0.0500 time 0.5656 (0.5875) data time 0.0006 (0.0032) model time 0.5650 (0.5779) loss 7.8654 (7.3056) grad_norm 1.7246 (2.1902) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:14:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][220/625] eta 0:03:57 lr 0.000317 wd 0.0500 time 0.5696 (0.5869) data time 0.0006 (0.0031) model time 0.5689 (0.5776) loss 7.0042 (7.3006) grad_norm 1.9760 (2.2039) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:14:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][230/625] eta 0:03:51 lr 0.000317 wd 0.0500 time 0.5707 (0.5864) data time 0.0009 (0.0030) model time 0.5698 (0.5774) loss 7.9050 (7.2974) grad_norm 1.7296 (2.2044) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:15:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][240/625] eta 0:03:45 lr 0.000316 wd 0.0500 time 0.7394 (0.5868) data time 0.0007 (0.0029) model time 0.7387 (0.5784) loss 7.4968 (7.3046) grad_norm 5.4367 (2.2229) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:15:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][250/625] eta 0:03:39 lr 0.000316 wd 0.0500 time 0.5796 (0.5866) data time 0.0006 (0.0028) model time 0.5790 (0.5785) loss 6.9275 (7.2994) grad_norm 2.4919 (2.2523) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:15:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][260/625] eta 0:03:34 lr 0.000316 wd 0.0500 time 0.7449 (0.5889) data time 0.0008 (0.0028) model time 0.7441 (0.5817) loss 8.4853 (7.3055) grad_norm 1.8766 (2.2618) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:15:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][270/625] eta 0:03:29 lr 0.000316 wd 0.0500 time 0.5683 (0.5901) data time 0.0008 (0.0027) model time 0.5675 (0.5835) loss 8.3751 (7.2953) grad_norm 2.2147 (2.2702) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:15:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][280/625] eta 0:03:23 lr 0.000316 wd 0.0500 time 0.5726 (0.5902) data time 0.0009 (0.0026) model time 0.5717 (0.5838) loss 8.6071 (7.2978) grad_norm 2.2014 (2.2733) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:15:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][290/625] eta 0:03:17 lr 0.000316 wd 0.0500 time 0.5749 (0.5902) data time 0.0008 (0.0026) model time 0.5742 (0.5841) loss 6.4538 (7.2826) grad_norm 2.0111 (2.2659) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:15:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][300/625] eta 0:03:11 lr 0.000316 wd 0.0500 time 0.5720 (0.5902) data time 0.0006 (0.0025) model time 0.5713 (0.5842) loss 6.0666 (7.2854) grad_norm 1.7135 (2.2649) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:15:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][310/625] eta 0:03:05 lr 0.000316 wd 0.0500 time 0.5711 (0.5897) data time 0.0006 (0.0025) model time 0.5705 (0.5838) loss 7.1507 (7.2718) grad_norm 2.4854 (2.2620) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:15:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][320/625] eta 0:02:59 lr 0.000316 wd 0.0500 time 0.5715 (0.5892) data time 0.0006 (0.0024) model time 0.5709 (0.5834) loss 8.2049 (7.2720) grad_norm 2.3547 (2.2607) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:15:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][330/625] eta 0:02:53 lr 0.000316 wd 0.0500 time 0.5736 (0.5888) data time 0.0006 (0.0024) model time 0.5730 (0.5831) loss 7.2352 (7.2629) grad_norm 1.9493 (2.2502) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:15:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][340/625] eta 0:02:47 lr 0.000316 wd 0.0500 time 0.5736 (0.5884) data time 0.0007 (0.0023) model time 0.5729 (0.5828) loss 6.6862 (7.2782) grad_norm 1.9346 (2.2555) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:16:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][350/625] eta 0:02:41 lr 0.000315 wd 0.0500 time 0.5596 (0.5881) data time 0.0007 (0.0023) model time 0.5589 (0.5826) loss 7.5367 (7.2765) grad_norm 3.3151 (2.2718) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:16:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][360/625] eta 0:02:35 lr 0.000315 wd 0.0500 time 0.5711 (0.5881) data time 0.0006 (0.0022) model time 0.5705 (0.5827) loss 5.7858 (7.2781) grad_norm 2.4927 (2.2747) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:16:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][370/625] eta 0:02:29 lr 0.000315 wd 0.0500 time 0.5709 (0.5877) data time 0.0007 (0.0022) model time 0.5702 (0.5825) loss 6.7454 (7.2701) grad_norm 2.4679 (2.2717) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:16:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][380/625] eta 0:02:23 lr 0.000315 wd 0.0500 time 0.5699 (0.5873) data time 0.0007 (0.0022) model time 0.5692 (0.5822) loss 8.0157 (7.2675) grad_norm 5.3286 (2.2885) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:16:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][390/625] eta 0:02:17 lr 0.000315 wd 0.0500 time 0.5661 (0.5870) data time 0.0008 (0.0021) model time 0.5653 (0.5819) loss 7.2535 (7.2666) grad_norm 2.4618 (2.2966) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:16:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][400/625] eta 0:02:12 lr 0.000315 wd 0.0500 time 0.5693 (0.5867) data time 0.0007 (0.0021) model time 0.5686 (0.5817) loss 7.9409 (7.2801) grad_norm 2.6234 (2.3091) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:16:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][410/625] eta 0:02:06 lr 0.000315 wd 0.0500 time 0.5723 (0.5864) data time 0.0006 (0.0021) model time 0.5717 (0.5815) loss 8.2532 (7.2840) grad_norm 3.6497 (2.3316) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:16:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][420/625] eta 0:02:00 lr 0.000315 wd 0.0500 time 0.5691 (0.5861) data time 0.0006 (0.0020) model time 0.5686 (0.5813) loss 7.1022 (7.2752) grad_norm 2.5676 (2.3314) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:16:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][430/625] eta 0:01:54 lr 0.000315 wd 0.0500 time 0.5676 (0.5859) data time 0.0006 (0.0020) model time 0.5670 (0.5811) loss 6.0013 (7.2805) grad_norm 2.1357 (2.3325) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:16:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][440/625] eta 0:01:48 lr 0.000315 wd 0.0500 time 0.5710 (0.5856) data time 0.0007 (0.0020) model time 0.5703 (0.5809) loss 7.5268 (7.2801) grad_norm 3.1148 (2.3396) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:17:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][450/625] eta 0:01:42 lr 0.000314 wd 0.0500 time 0.5728 (0.5854) data time 0.0008 (0.0020) model time 0.5720 (0.5807) loss 6.7686 (7.2794) grad_norm 1.8111 (2.3400) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:17:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][460/625] eta 0:01:36 lr 0.000314 wd 0.0500 time 0.5725 (0.5855) data time 0.0006 (0.0019) model time 0.5720 (0.5810) loss 9.1582 (7.2819) grad_norm 2.0165 (2.3332) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:17:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][470/625] eta 0:01:30 lr 0.000314 wd 0.0500 time 0.5708 (0.5858) data time 0.0006 (0.0019) model time 0.5702 (0.5814) loss 7.6794 (7.2791) grad_norm 3.3824 (2.3762) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:17:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][480/625] eta 0:01:25 lr 0.000314 wd 0.0500 time 0.7264 (0.5869) data time 0.0006 (0.0019) model time 0.7258 (0.5827) loss 6.0800 (7.2689) grad_norm 3.7121 (2.3952) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:17:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][490/625] eta 0:01:19 lr 0.000314 wd 0.0500 time 0.7469 (0.5883) data time 0.0007 (0.0019) model time 0.7463 (0.5843) loss 6.5672 (7.2630) grad_norm 2.1564 (2.3898) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:17:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][500/625] eta 0:01:13 lr 0.000314 wd 0.0500 time 0.5767 (0.5883) data time 0.0009 (0.0019) model time 0.5758 (0.5844) loss 7.5857 (7.2673) grad_norm 2.3146 (2.3908) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:17:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][510/625] eta 0:01:07 lr 0.000314 wd 0.0500 time 0.5919 (0.5883) data time 0.0007 (0.0018) model time 0.5912 (0.5845) loss 9.1068 (7.2749) grad_norm 2.0037 (2.3917) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:17:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][520/625] eta 0:01:01 lr 0.000314 wd 0.0500 time 0.5756 (0.5884) data time 0.0006 (0.0018) model time 0.5750 (0.5847) loss 6.8688 (7.2809) grad_norm 1.8394 (2.4042) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:17:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][530/625] eta 0:00:55 lr 0.000314 wd 0.0500 time 0.5732 (0.5882) data time 0.0007 (0.0018) model time 0.5726 (0.5845) loss 6.8201 (7.2791) grad_norm 3.5056 (2.3994) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:17:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][540/625] eta 0:00:49 lr 0.000314 wd 0.0500 time 0.5730 (0.5879) data time 0.0008 (0.0018) model time 0.5722 (0.5842) loss 9.0904 (7.2790) grad_norm 2.4160 (2.3984) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:18:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][550/625] eta 0:00:44 lr 0.000314 wd 0.0500 time 0.5746 (0.5877) data time 0.0006 (0.0018) model time 0.5740 (0.5841) loss 7.8582 (7.2792) grad_norm 2.7330 (2.4131) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:18:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][560/625] eta 0:00:38 lr 0.000313 wd 0.0500 time 0.5704 (0.5875) data time 0.0008 (0.0017) model time 0.5695 (0.5839) loss 7.5885 (7.2780) grad_norm 1.6679 (2.4147) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:18:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][570/625] eta 0:00:32 lr 0.000313 wd 0.0500 time 0.5740 (0.5873) data time 0.0006 (0.0017) model time 0.5734 (0.5837) loss 7.2799 (7.2746) grad_norm 3.4606 (2.4124) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:18:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][580/625] eta 0:00:26 lr 0.000313 wd 0.0500 time 0.5720 (0.5873) data time 0.0006 (0.0017) model time 0.5714 (0.5838) loss 6.7496 (7.2830) grad_norm 3.3295 (2.4129) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:18:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][590/625] eta 0:00:20 lr 0.000313 wd 0.0500 time 0.5683 (0.5871) data time 0.0006 (0.0017) model time 0.5677 (0.5836) loss 7.9265 (7.2839) grad_norm 3.4821 (2.4099) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:18:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][600/625] eta 0:00:14 lr 0.000313 wd 0.0500 time 0.5702 (0.5869) data time 0.0006 (0.0017) model time 0.5696 (0.5834) loss 6.7394 (7.2819) grad_norm 1.9106 (2.4043) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:18:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][610/625] eta 0:00:08 lr 0.000313 wd 0.0500 time 0.5646 (0.5867) data time 0.0006 (0.0017) model time 0.5640 (0.5832) loss 7.4738 (7.2758) grad_norm 3.8878 (2.4068) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:18:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [205/300][620/625] eta 0:00:02 lr 0.000313 wd 0.0500 time 0.5704 (0.5865) data time 0.0006 (0.0017) model time 0.5698 (0.5831) loss 7.9229 (7.2715) grad_norm 2.4556 (2.4052) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:18:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 205 training takes 0:06:06 +[2024-07-25 11:18:45 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 11:18:47 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 11:18:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.483 (0.483) Loss 0.5039 (0.5039) Acc@1 90.234 (90.234) Acc@5 98.828 (98.828) Mem 22339MB +[2024-07-25 11:18:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.7603 (0.6203) Acc@1 83.203 (87.367) Acc@5 96.729 (97.963) Mem 22339MB +[2024-07-25 11:18:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.8506 (0.7154) Acc@1 79.150 (84.482) Acc@5 95.850 (97.040) Mem 22339MB +[2024-07-25 11:18:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.107 Acc@5 97.035 +[2024-07-25 11:18:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.1% +[2024-07-25 11:18:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.883 (0.883) Loss 0.5034 (0.5034) Acc@1 90.430 (90.430) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 11:18:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.195) Loss 0.7510 (0.6215) Acc@1 83.252 (87.473) Acc@5 96.631 (97.985) Mem 22339MB +[2024-07-25 11:18:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.162) Loss 0.8677 (0.7128) Acc@1 79.053 (84.538) Acc@5 96.094 (97.131) Mem 22339MB +[2024-07-25 11:18:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.163 Acc@5 97.111 +[2024-07-25 11:18:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.2% +[2024-07-25 11:18:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.16% +[2024-07-25 11:18:54 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 11:18:56 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 11:18:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][0/625] eta 0:09:27 lr 0.000313 wd 0.0500 time 0.9074 (0.9074) data time 0.3878 (0.3878) model time 0.0000 (0.0000) loss 6.8452 (6.8452) grad_norm 1.8908 (1.8908) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:19:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][10/625] eta 0:06:11 lr 0.000313 wd 0.0500 time 0.5697 (0.6037) data time 0.0008 (0.0360) model time 0.0000 (0.0000) loss 8.6041 (7.4190) grad_norm 2.7256 (2.5453) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:19:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][20/625] eta 0:05:56 lr 0.000313 wd 0.0500 time 0.5708 (0.5898) data time 0.0008 (0.0193) model time 0.0000 (0.0000) loss 6.2983 (7.1182) grad_norm 1.6428 (2.3562) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:19:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][30/625] eta 0:05:48 lr 0.000313 wd 0.0500 time 0.5386 (0.5858) data time 0.0006 (0.0134) model time 0.0000 (0.0000) loss 7.9890 (7.0730) grad_norm 2.7078 (2.3954) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:19:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][40/625] eta 0:05:41 lr 0.000312 wd 0.0500 time 0.5737 (0.5829) data time 0.0008 (0.0104) model time 0.0000 (0.0000) loss 8.5331 (7.0828) grad_norm 1.5950 (2.3863) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:19:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][50/625] eta 0:05:36 lr 0.000312 wd 0.0500 time 0.7232 (0.5844) data time 0.0008 (0.0085) model time 0.0000 (0.0000) loss 7.0638 (7.1188) grad_norm 1.9863 (2.3786) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:19:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][60/625] eta 0:05:32 lr 0.000312 wd 0.0500 time 0.6275 (0.5876) data time 0.0006 (0.0073) model time 0.6270 (0.6033) loss 6.8699 (7.1254) grad_norm 1.6653 (2.3782) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:19:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][70/625] eta 0:05:27 lr 0.000312 wd 0.0500 time 0.6077 (0.5894) data time 0.0008 (0.0064) model time 0.6069 (0.6012) loss 6.3707 (7.0691) grad_norm 2.0956 (2.3189) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:19:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][80/625] eta 0:05:23 lr 0.000312 wd 0.0500 time 0.5749 (0.5932) data time 0.0007 (0.0057) model time 0.5742 (0.6072) loss 9.0091 (7.0868) grad_norm 5.5649 (2.3533) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:19:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][90/625] eta 0:05:20 lr 0.000312 wd 0.0500 time 0.7325 (0.5988) data time 0.0006 (0.0051) model time 0.7319 (0.6164) loss 8.0617 (7.0954) grad_norm 1.9533 (2.4031) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:19:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][100/625] eta 0:05:13 lr 0.000312 wd 0.0500 time 0.5681 (0.5976) data time 0.0007 (0.0047) model time 0.5673 (0.6102) loss 8.4012 (7.0873) grad_norm 4.1198 (2.4124) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:20:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][110/625] eta 0:05:09 lr 0.000312 wd 0.0500 time 0.5711 (0.6003) data time 0.0006 (0.0044) model time 0.5705 (0.6129) loss 8.1520 (7.0928) grad_norm 3.3409 (2.4072) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:20:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][120/625] eta 0:05:02 lr 0.000312 wd 0.0500 time 0.5713 (0.5990) data time 0.0006 (0.0041) model time 0.5707 (0.6088) loss 7.6153 (7.0925) grad_norm 1.8450 (2.4599) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:20:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][130/625] eta 0:04:55 lr 0.000312 wd 0.0500 time 0.5717 (0.5971) data time 0.0007 (0.0038) model time 0.5710 (0.6043) loss 7.0608 (7.0998) grad_norm 1.7369 (2.4316) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:20:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][140/625] eta 0:04:48 lr 0.000312 wd 0.0500 time 0.5727 (0.5954) data time 0.0007 (0.0036) model time 0.5720 (0.6008) loss 7.4262 (7.1343) grad_norm 3.2339 (2.4439) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:20:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][150/625] eta 0:04:42 lr 0.000311 wd 0.0500 time 0.5705 (0.5940) data time 0.0009 (0.0034) model time 0.5696 (0.5980) loss 8.7746 (7.1722) grad_norm 2.1949 (2.4355) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:20:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][160/625] eta 0:04:35 lr 0.000311 wd 0.0500 time 0.5724 (0.5928) data time 0.0006 (0.0033) model time 0.5718 (0.5958) loss 7.7264 (7.1605) grad_norm 2.4217 (2.4750) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:20:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][170/625] eta 0:04:29 lr 0.000311 wd 0.0500 time 0.5722 (0.5917) data time 0.0008 (0.0031) model time 0.5713 (0.5939) loss 6.5072 (7.1386) grad_norm 4.1422 (2.5657) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:20:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][180/625] eta 0:04:22 lr 0.000311 wd 0.0500 time 0.5708 (0.5907) data time 0.0008 (0.0030) model time 0.5700 (0.5923) loss 8.1020 (7.1356) grad_norm 3.2859 (2.5904) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:20:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][190/625] eta 0:04:16 lr 0.000311 wd 0.0500 time 0.5612 (0.5898) data time 0.0006 (0.0029) model time 0.5606 (0.5909) loss 7.0930 (7.1303) grad_norm 2.2174 (2.5872) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:20:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][200/625] eta 0:04:10 lr 0.000311 wd 0.0500 time 0.5708 (0.5890) data time 0.0007 (0.0028) model time 0.5701 (0.5897) loss 7.4834 (7.1483) grad_norm 1.8653 (2.5735) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:21:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][210/625] eta 0:04:04 lr 0.000311 wd 0.0500 time 0.5727 (0.5884) data time 0.0010 (0.0027) model time 0.5717 (0.5888) loss 8.8415 (7.1557) grad_norm 1.9814 (2.6012) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:21:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][220/625] eta 0:03:58 lr 0.000311 wd 0.0500 time 0.5754 (0.5878) data time 0.0008 (0.0026) model time 0.5747 (0.5879) loss 7.0393 (7.1653) grad_norm 1.9533 (2.6125) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:21:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][230/625] eta 0:03:51 lr 0.000311 wd 0.0500 time 0.5724 (0.5872) data time 0.0008 (0.0026) model time 0.5716 (0.5871) loss 6.5558 (7.1859) grad_norm 2.5235 (2.6040) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:21:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][240/625] eta 0:03:45 lr 0.000311 wd 0.0500 time 0.5744 (0.5867) data time 0.0006 (0.0025) model time 0.5738 (0.5865) loss 6.6357 (7.1914) grad_norm 2.1224 (2.5881) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:21:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][250/625] eta 0:03:39 lr 0.000311 wd 0.0500 time 0.5709 (0.5862) data time 0.0007 (0.0024) model time 0.5702 (0.5857) loss 6.4319 (7.1912) grad_norm 3.2364 (2.5918) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:21:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][260/625] eta 0:03:33 lr 0.000310 wd 0.0500 time 0.5736 (0.5857) data time 0.0006 (0.0024) model time 0.5730 (0.5852) loss 5.7648 (7.1764) grad_norm 1.9740 (2.5928) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:21:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][270/625] eta 0:03:27 lr 0.000310 wd 0.0500 time 0.5704 (0.5855) data time 0.0008 (0.0023) model time 0.5696 (0.5849) loss 7.7640 (7.1751) grad_norm 2.1376 (2.5928) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:21:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][280/625] eta 0:03:22 lr 0.000310 wd 0.0500 time 0.7013 (0.5862) data time 0.0008 (0.0023) model time 0.7005 (0.5857) loss 7.8584 (7.1642) grad_norm 6.0155 (2.5924) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:21:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][290/625] eta 0:03:16 lr 0.000310 wd 0.0500 time 0.5813 (0.5860) data time 0.0009 (0.0022) model time 0.5804 (0.5855) loss 7.6309 (7.1564) grad_norm 2.9435 (2.5963) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:21:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][300/625] eta 0:03:11 lr 0.000310 wd 0.0500 time 0.7385 (0.5882) data time 0.0006 (0.0022) model time 0.7380 (0.5881) loss 6.1584 (7.1606) grad_norm 2.0186 (2.5959) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:21:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][310/625] eta 0:03:05 lr 0.000310 wd 0.0500 time 0.6242 (0.5900) data time 0.0006 (0.0021) model time 0.6236 (0.5902) loss 7.5827 (7.1463) grad_norm 3.8109 (2.5862) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:22:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][320/625] eta 0:02:59 lr 0.000310 wd 0.0500 time 0.5677 (0.5899) data time 0.0008 (0.0021) model time 0.5669 (0.5900) loss 7.4612 (7.1512) grad_norm 3.7979 (2.5841) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:22:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][330/625] eta 0:02:54 lr 0.000310 wd 0.0500 time 0.5697 (0.5907) data time 0.0006 (0.0020) model time 0.5690 (0.5910) loss 5.8600 (7.1481) grad_norm 2.6140 (2.5732) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:22:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][340/625] eta 0:02:48 lr 0.000310 wd 0.0500 time 0.5714 (0.5904) data time 0.0007 (0.0020) model time 0.5707 (0.5906) loss 6.2118 (7.1557) grad_norm 2.0662 (2.5708) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:22:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][350/625] eta 0:02:42 lr 0.000310 wd 0.0500 time 0.5744 (0.5899) data time 0.0007 (0.0020) model time 0.5738 (0.5900) loss 7.2140 (7.1527) grad_norm 4.8967 (2.5906) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:22:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][360/625] eta 0:02:36 lr 0.000310 wd 0.0500 time 0.5747 (0.5895) data time 0.0006 (0.0020) model time 0.5741 (0.5895) loss 6.5820 (7.1532) grad_norm 2.7710 (2.5936) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:22:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][370/625] eta 0:02:30 lr 0.000309 wd 0.0500 time 0.5760 (0.5892) data time 0.0006 (0.0019) model time 0.5754 (0.5890) loss 6.7647 (7.1486) grad_norm 2.7904 (2.5935) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:22:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][380/625] eta 0:02:24 lr 0.000309 wd 0.0500 time 0.5771 (0.5888) data time 0.0007 (0.0019) model time 0.5764 (0.5886) loss 7.8239 (7.1564) grad_norm 3.3370 (2.5895) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:22:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][390/625] eta 0:02:18 lr 0.000309 wd 0.0500 time 0.5728 (0.5884) data time 0.0008 (0.0019) model time 0.5720 (0.5882) loss 8.0196 (7.1651) grad_norm 4.1371 (2.5886) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:22:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][400/625] eta 0:02:12 lr 0.000309 wd 0.0500 time 0.5684 (0.5881) data time 0.0008 (0.0018) model time 0.5676 (0.5878) loss 6.8457 (7.1692) grad_norm 2.7627 (2.5919) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:22:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][410/625] eta 0:02:06 lr 0.000309 wd 0.0500 time 0.5729 (0.5878) data time 0.0006 (0.0018) model time 0.5722 (0.5874) loss 7.6319 (7.1759) grad_norm 2.1427 (2.5866) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:23:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][420/625] eta 0:02:00 lr 0.000309 wd 0.0500 time 0.5720 (0.5875) data time 0.0006 (0.0018) model time 0.5714 (0.5870) loss 5.3201 (7.1812) grad_norm 3.5473 (2.5802) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:23:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][430/625] eta 0:01:54 lr 0.000309 wd 0.0500 time 0.5725 (0.5872) data time 0.0008 (0.0018) model time 0.5717 (0.5867) loss 7.1649 (7.1860) grad_norm 2.0626 (2.5672) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:23:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][440/625] eta 0:01:48 lr 0.000309 wd 0.0500 time 0.5702 (0.5869) data time 0.0006 (0.0018) model time 0.5696 (0.5864) loss 8.4292 (7.1855) grad_norm 1.9111 (2.5582) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:23:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][450/625] eta 0:01:42 lr 0.000309 wd 0.0500 time 0.5746 (0.5867) data time 0.0008 (0.0017) model time 0.5738 (0.5861) loss 6.7861 (7.1866) grad_norm 2.0470 (2.5605) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:23:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][460/625] eta 0:01:36 lr 0.000309 wd 0.0500 time 0.5681 (0.5864) data time 0.0006 (0.0017) model time 0.5675 (0.5858) loss 7.0348 (7.1870) grad_norm 1.8408 (2.5564) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:23:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][470/625] eta 0:01:30 lr 0.000309 wd 0.0500 time 0.5712 (0.5862) data time 0.0006 (0.0017) model time 0.5706 (0.5855) loss 5.7984 (7.1888) grad_norm 2.2741 (2.5735) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:23:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][480/625] eta 0:01:24 lr 0.000308 wd 0.0500 time 0.5747 (0.5859) data time 0.0009 (0.0017) model time 0.5739 (0.5852) loss 8.0293 (7.1950) grad_norm 2.5269 (2.5813) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:23:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][490/625] eta 0:01:19 lr 0.000308 wd 0.0500 time 0.5728 (0.5859) data time 0.0008 (0.0017) model time 0.5720 (0.5852) loss 8.0343 (7.2011) grad_norm 2.2820 (2.5841) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:23:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][500/625] eta 0:01:13 lr 0.000308 wd 0.0500 time 0.5702 (0.5861) data time 0.0007 (0.0017) model time 0.5694 (0.5855) loss 8.6691 (7.2056) grad_norm 8.0269 (2.5953) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:23:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][510/625] eta 0:01:07 lr 0.000308 wd 0.0500 time 0.5747 (0.5861) data time 0.0008 (0.0017) model time 0.5739 (0.5854) loss 7.2638 (7.2056) grad_norm 3.4543 (2.5964) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:24:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][520/625] eta 0:01:01 lr 0.000308 wd 0.0500 time 0.7449 (0.5876) data time 0.0006 (0.0016) model time 0.7443 (0.5870) loss 8.4517 (7.2067) grad_norm 3.4040 (2.5988) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:24:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][530/625] eta 0:00:55 lr 0.000308 wd 0.0500 time 0.6708 (0.5880) data time 0.0008 (0.0016) model time 0.6700 (0.5875) loss 6.3265 (7.2044) grad_norm 1.9746 (2.5923) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:24:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][540/625] eta 0:00:49 lr 0.000308 wd 0.0500 time 0.5734 (0.5880) data time 0.0007 (0.0016) model time 0.5728 (0.5875) loss 7.9716 (7.2104) grad_norm 1.8912 (2.5867) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:24:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][550/625] eta 0:00:44 lr 0.000308 wd 0.0500 time 0.5726 (0.5886) data time 0.0008 (0.0016) model time 0.5719 (0.5881) loss 6.3268 (7.2121) grad_norm 1.9230 (2.5781) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:24:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][560/625] eta 0:00:38 lr 0.000308 wd 0.0500 time 0.5708 (0.5884) data time 0.0007 (0.0016) model time 0.5701 (0.5879) loss 7.6091 (7.2080) grad_norm 3.1345 (2.5723) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:24:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][570/625] eta 0:00:32 lr 0.000308 wd 0.0500 time 0.5752 (0.5882) data time 0.0006 (0.0016) model time 0.5746 (0.5877) loss 6.5651 (7.2128) grad_norm 3.2264 (2.5784) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:24:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][580/625] eta 0:00:26 lr 0.000307 wd 0.0500 time 0.5729 (0.5879) data time 0.0008 (0.0016) model time 0.5721 (0.5874) loss 7.0084 (7.2112) grad_norm 3.9977 (2.5946) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:24:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][590/625] eta 0:00:20 lr 0.000307 wd 0.0500 time 0.5716 (0.5877) data time 0.0007 (0.0015) model time 0.5710 (0.5872) loss 7.6450 (7.2123) grad_norm 4.2820 (2.6138) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:24:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][600/625] eta 0:00:14 lr 0.000307 wd 0.0500 time 0.5752 (0.5875) data time 0.0006 (0.0015) model time 0.5746 (0.5869) loss 5.8959 (7.2160) grad_norm 2.0589 (2.6176) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:24:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][610/625] eta 0:00:08 lr 0.000307 wd 0.0500 time 0.5708 (0.5873) data time 0.0006 (0.0015) model time 0.5702 (0.5867) loss 6.5880 (7.2154) grad_norm 2.5694 (2.6322) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:25:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [206/300][620/625] eta 0:00:02 lr 0.000307 wd 0.0500 time 0.5716 (0.5871) data time 0.0004 (0.0015) model time 0.5712 (0.5865) loss 6.8588 (7.2165) grad_norm 2.1501 (2.6273) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:25:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 206 training takes 0:06:06 +[2024-07-25 11:25:02 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 11:25:04 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 11:25:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.476 (0.476) Loss 0.5132 (0.5132) Acc@1 89.990 (89.990) Acc@5 98.779 (98.779) Mem 22339MB +[2024-07-25 11:25:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7563 (0.6253) Acc@1 83.008 (87.305) Acc@5 96.729 (97.971) Mem 22339MB +[2024-07-25 11:25:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8657 (0.7194) Acc@1 79.297 (84.524) Acc@5 96.094 (97.082) Mem 22339MB +[2024-07-25 11:25:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.101 Acc@5 97.053 +[2024-07-25 11:25:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.1% +[2024-07-25 11:25:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.787 (0.787) Loss 0.5039 (0.5039) Acc@1 90.381 (90.381) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 11:25:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.186) Loss 0.7505 (0.6212) Acc@1 83.350 (87.518) Acc@5 96.631 (98.002) Mem 22339MB +[2024-07-25 11:25:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.157) Loss 0.8667 (0.7123) Acc@1 78.955 (84.542) Acc@5 96.045 (97.133) Mem 22339MB +[2024-07-25 11:25:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.171 Acc@5 97.109 +[2024-07-25 11:25:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.2% +[2024-07-25 11:25:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.17% +[2024-07-25 11:25:12 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 11:25:13 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 11:25:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][0/625] eta 0:09:10 lr 0.000307 wd 0.0500 time 0.8813 (0.8813) data time 0.3645 (0.3645) model time 0.0000 (0.0000) loss 7.7756 (7.7756) grad_norm 2.2980 (2.2980) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:25:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][10/625] eta 0:06:10 lr 0.000307 wd 0.0500 time 0.5738 (0.6025) data time 0.0007 (0.0339) model time 0.0000 (0.0000) loss 7.0837 (7.9861) grad_norm 2.1293 (2.0092) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:25:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][20/625] eta 0:05:55 lr 0.000307 wd 0.0500 time 0.5704 (0.5884) data time 0.0007 (0.0181) model time 0.0000 (0.0000) loss 7.8394 (7.6734) grad_norm 2.6932 (2.5399) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:25:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][30/625] eta 0:05:46 lr 0.000307 wd 0.0500 time 0.5705 (0.5831) data time 0.0007 (0.0126) model time 0.0000 (0.0000) loss 7.9949 (7.4944) grad_norm 1.8970 (2.6041) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:25:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][40/625] eta 0:05:40 lr 0.000307 wd 0.0500 time 0.5740 (0.5812) data time 0.0006 (0.0097) model time 0.0000 (0.0000) loss 6.6224 (7.4195) grad_norm 2.3313 (2.5654) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-25 11:25:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][50/625] eta 0:05:33 lr 0.000307 wd 0.0500 time 0.5686 (0.5797) data time 0.0006 (0.0079) model time 0.0000 (0.0000) loss 6.9417 (7.4425) grad_norm 1.9409 (2.5912) loss_scale 512.0000 (301.1765) mem 22339MB +[2024-07-25 11:25:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][60/625] eta 0:05:26 lr 0.000307 wd 0.0500 time 0.5773 (0.5787) data time 0.0006 (0.0068) model time 0.5767 (0.5733) loss 6.5853 (7.4193) grad_norm 2.1913 (2.5562) loss_scale 512.0000 (335.7377) mem 22339MB +[2024-07-25 11:25:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][70/625] eta 0:05:20 lr 0.000306 wd 0.0500 time 0.5698 (0.5781) data time 0.0006 (0.0059) model time 0.5692 (0.5734) loss 7.8593 (7.4080) grad_norm 1.8914 (2.4977) loss_scale 512.0000 (360.5634) mem 22339MB +[2024-07-25 11:26:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][80/625] eta 0:05:14 lr 0.000306 wd 0.0500 time 0.5695 (0.5776) data time 0.0006 (0.0053) model time 0.5689 (0.5732) loss 6.7760 (7.4404) grad_norm 1.6919 (2.4915) loss_scale 512.0000 (379.2593) mem 22339MB +[2024-07-25 11:26:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][90/625] eta 0:05:10 lr 0.000306 wd 0.0500 time 0.5717 (0.5800) data time 0.0006 (0.0048) model time 0.5711 (0.5794) loss 6.2431 (7.4276) grad_norm 2.4381 (2.4534) loss_scale 512.0000 (393.8462) mem 22339MB +[2024-07-25 11:26:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][100/625] eta 0:05:05 lr 0.000306 wd 0.0500 time 0.5728 (0.5813) data time 0.0006 (0.0045) model time 0.5722 (0.5821) loss 8.0067 (7.4199) grad_norm 1.8836 (2.4114) loss_scale 512.0000 (405.5446) mem 22339MB +[2024-07-25 11:26:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][110/625] eta 0:05:01 lr 0.000306 wd 0.0500 time 0.7486 (0.5852) data time 0.0008 (0.0042) model time 0.7477 (0.5890) loss 7.4920 (7.3790) grad_norm 1.5138 (2.3966) loss_scale 512.0000 (415.1351) mem 22339MB +[2024-07-25 11:26:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][120/625] eta 0:04:58 lr 0.000306 wd 0.0500 time 0.6416 (0.5907) data time 0.0006 (0.0039) model time 0.6410 (0.5978) loss 7.4346 (7.3832) grad_norm 1.9373 (2.3731) loss_scale 512.0000 (423.1405) mem 22339MB +[2024-07-25 11:26:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][130/625] eta 0:04:53 lr 0.000306 wd 0.0500 time 0.5687 (0.5922) data time 0.0008 (0.0036) model time 0.5679 (0.5993) loss 6.2840 (7.3467) grad_norm 2.7072 (2.3539) loss_scale 512.0000 (429.9237) mem 22339MB +[2024-07-25 11:26:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][140/625] eta 0:04:46 lr 0.000306 wd 0.0500 time 0.5729 (0.5909) data time 0.0008 (0.0035) model time 0.5720 (0.5964) loss 7.3264 (7.3430) grad_norm 2.0067 (2.3677) loss_scale 512.0000 (435.7447) mem 22339MB +[2024-07-25 11:26:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][150/625] eta 0:04:41 lr 0.000306 wd 0.0500 time 0.6998 (0.5927) data time 0.0008 (0.0033) model time 0.6990 (0.5985) loss 6.6891 (7.3572) grad_norm 1.9857 (2.3719) loss_scale 512.0000 (440.7947) mem 22339MB +[2024-07-25 11:26:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][160/625] eta 0:04:35 lr 0.000306 wd 0.0500 time 0.5621 (0.5916) data time 0.0009 (0.0031) model time 0.5613 (0.5962) loss 7.5533 (7.3550) grad_norm 1.9707 (2.4112) loss_scale 512.0000 (445.2174) mem 22339MB +[2024-07-25 11:26:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][170/625] eta 0:04:28 lr 0.000306 wd 0.0500 time 0.5691 (0.5906) data time 0.0008 (0.0030) model time 0.5683 (0.5943) loss 8.6096 (7.3522) grad_norm 1.8669 (2.4145) loss_scale 512.0000 (449.1228) mem 22339MB +[2024-07-25 11:27:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][180/625] eta 0:04:22 lr 0.000305 wd 0.0500 time 0.5686 (0.5897) data time 0.0007 (0.0029) model time 0.5678 (0.5928) loss 7.9271 (7.3371) grad_norm 2.0993 (2.4094) loss_scale 512.0000 (452.5967) mem 22339MB +[2024-07-25 11:27:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][190/625] eta 0:04:16 lr 0.000305 wd 0.0500 time 0.5718 (0.5889) data time 0.0006 (0.0028) model time 0.5712 (0.5913) loss 6.9221 (7.3192) grad_norm 1.8697 (2.4119) loss_scale 512.0000 (455.7068) mem 22339MB +[2024-07-25 11:27:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][200/625] eta 0:04:09 lr 0.000305 wd 0.0500 time 0.5729 (0.5881) data time 0.0006 (0.0027) model time 0.5723 (0.5901) loss 7.4211 (7.3311) grad_norm 2.2278 (2.3965) loss_scale 512.0000 (458.5075) mem 22339MB +[2024-07-25 11:27:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][210/625] eta 0:04:03 lr 0.000305 wd 0.0500 time 0.5723 (0.5875) data time 0.0006 (0.0026) model time 0.5716 (0.5891) loss 7.0274 (7.3206) grad_norm 2.3925 (2.3937) loss_scale 512.0000 (461.0427) mem 22339MB +[2024-07-25 11:27:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][220/625] eta 0:03:57 lr 0.000305 wd 0.0500 time 0.5745 (0.5869) data time 0.0006 (0.0025) model time 0.5740 (0.5882) loss 7.7191 (7.3151) grad_norm 1.8349 (2.3917) loss_scale 512.0000 (463.3484) mem 22339MB +[2024-07-25 11:27:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][230/625] eta 0:03:51 lr 0.000305 wd 0.0500 time 0.5737 (0.5864) data time 0.0006 (0.0024) model time 0.5731 (0.5874) loss 8.0959 (7.3112) grad_norm 2.8211 (2.3977) loss_scale 512.0000 (465.4545) mem 22339MB +[2024-07-25 11:27:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][240/625] eta 0:03:45 lr 0.000305 wd 0.0500 time 0.5752 (0.5859) data time 0.0008 (0.0024) model time 0.5743 (0.5867) loss 8.0564 (7.3035) grad_norm 2.4185 (2.4010) loss_scale 512.0000 (467.3859) mem 22339MB +[2024-07-25 11:27:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][250/625] eta 0:03:39 lr 0.000305 wd 0.0500 time 0.5729 (0.5855) data time 0.0006 (0.0023) model time 0.5723 (0.5861) loss 6.6554 (7.3027) grad_norm 1.9104 (2.3992) loss_scale 512.0000 (469.1633) mem 22339MB +[2024-07-25 11:27:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][260/625] eta 0:03:33 lr 0.000305 wd 0.0500 time 0.5710 (0.5851) data time 0.0008 (0.0023) model time 0.5703 (0.5855) loss 6.0534 (7.2976) grad_norm 2.7867 (2.4033) loss_scale 512.0000 (470.8046) mem 22339MB +[2024-07-25 11:27:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][270/625] eta 0:03:27 lr 0.000305 wd 0.0500 time 0.5748 (0.5847) data time 0.0006 (0.0022) model time 0.5741 (0.5850) loss 6.8039 (7.3090) grad_norm 1.5513 (2.3887) loss_scale 512.0000 (472.3247) mem 22339MB +[2024-07-25 11:27:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][280/625] eta 0:03:21 lr 0.000305 wd 0.0500 time 0.5666 (0.5844) data time 0.0008 (0.0022) model time 0.5658 (0.5845) loss 7.4976 (7.3020) grad_norm 2.5777 (2.3863) loss_scale 512.0000 (473.7367) mem 22339MB +[2024-07-25 11:28:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][290/625] eta 0:03:15 lr 0.000304 wd 0.0500 time 0.5702 (0.5840) data time 0.0007 (0.0021) model time 0.5695 (0.5841) loss 7.7761 (7.3075) grad_norm 4.5485 (2.3965) loss_scale 512.0000 (475.0515) mem 22339MB +[2024-07-25 11:28:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][300/625] eta 0:03:09 lr 0.000304 wd 0.0500 time 0.7145 (0.5843) data time 0.0006 (0.0021) model time 0.7138 (0.5843) loss 6.3112 (7.2997) grad_norm 2.8867 (2.4238) loss_scale 512.0000 (476.2791) mem 22339MB +[2024-07-25 11:28:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][310/625] eta 0:03:04 lr 0.000304 wd 0.0500 time 0.7089 (0.5846) data time 0.0007 (0.0020) model time 0.7083 (0.5847) loss 6.6423 (7.3037) grad_norm 1.8711 (2.4309) loss_scale 512.0000 (477.4277) mem 22339MB +[2024-07-25 11:28:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][320/625] eta 0:02:58 lr 0.000304 wd 0.0500 time 0.5674 (0.5845) data time 0.0006 (0.0020) model time 0.5668 (0.5845) loss 7.3550 (7.3011) grad_norm 2.7687 (2.4389) loss_scale 512.0000 (478.5047) mem 22339MB +[2024-07-25 11:28:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][330/625] eta 0:02:52 lr 0.000304 wd 0.0500 time 0.7440 (0.5855) data time 0.0009 (0.0020) model time 0.7431 (0.5857) loss 7.6589 (7.2940) grad_norm 1.7519 (2.4547) loss_scale 512.0000 (479.5166) mem 22339MB +[2024-07-25 11:28:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][340/625] eta 0:02:47 lr 0.000304 wd 0.0500 time 0.6356 (0.5865) data time 0.0006 (0.0019) model time 0.6350 (0.5868) loss 6.7077 (7.2911) grad_norm 2.3819 (2.4584) loss_scale 512.0000 (480.4692) mem 22339MB +[2024-07-25 11:28:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][350/625] eta 0:02:41 lr 0.000304 wd 0.0500 time 0.5677 (0.5869) data time 0.0008 (0.0019) model time 0.5669 (0.5872) loss 8.1283 (7.2900) grad_norm 1.8856 (2.4772) loss_scale 512.0000 (481.3675) mem 22339MB +[2024-07-25 11:28:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][360/625] eta 0:02:35 lr 0.000304 wd 0.0500 time 0.7178 (0.5869) data time 0.0008 (0.0019) model time 0.7170 (0.5872) loss 7.2307 (7.2893) grad_norm 1.9456 (2.4743) loss_scale 512.0000 (482.2161) mem 22339MB +[2024-07-25 11:28:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][370/625] eta 0:02:29 lr 0.000304 wd 0.0500 time 0.5707 (0.5870) data time 0.0008 (0.0018) model time 0.5699 (0.5873) loss 7.7129 (7.2914) grad_norm 2.2963 (2.4734) loss_scale 512.0000 (483.0189) mem 22339MB +[2024-07-25 11:28:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][380/625] eta 0:02:23 lr 0.000304 wd 0.0500 time 0.5762 (0.5867) data time 0.0007 (0.0018) model time 0.5755 (0.5870) loss 6.9906 (7.2846) grad_norm 2.3492 (2.4738) loss_scale 512.0000 (483.7795) mem 22339MB +[2024-07-25 11:29:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][390/625] eta 0:02:17 lr 0.000303 wd 0.0500 time 0.5733 (0.5864) data time 0.0006 (0.0018) model time 0.5727 (0.5866) loss 6.3786 (7.2782) grad_norm 2.1072 (2.4724) loss_scale 512.0000 (484.5013) mem 22339MB +[2024-07-25 11:29:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][400/625] eta 0:02:11 lr 0.000303 wd 0.0500 time 0.5628 (0.5861) data time 0.0008 (0.0018) model time 0.5620 (0.5862) loss 7.7836 (7.2777) grad_norm 1.9077 (2.4796) loss_scale 512.0000 (485.1870) mem 22339MB +[2024-07-25 11:29:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][410/625] eta 0:02:05 lr 0.000303 wd 0.0500 time 0.5767 (0.5859) data time 0.0007 (0.0018) model time 0.5759 (0.5859) loss 6.9171 (7.2792) grad_norm 13.8149 (2.5280) loss_scale 512.0000 (485.8394) mem 22339MB +[2024-07-25 11:29:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][420/625] eta 0:02:00 lr 0.000303 wd 0.0500 time 0.5755 (0.5856) data time 0.0008 (0.0017) model time 0.5747 (0.5855) loss 7.7097 (7.2714) grad_norm 1.7873 (2.5311) loss_scale 512.0000 (486.4608) mem 22339MB +[2024-07-25 11:29:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][430/625] eta 0:01:54 lr 0.000303 wd 0.0500 time 0.5730 (0.5853) data time 0.0006 (0.0017) model time 0.5724 (0.5852) loss 6.7459 (7.2680) grad_norm 3.6620 (2.5347) loss_scale 512.0000 (487.0534) mem 22339MB +[2024-07-25 11:29:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][440/625] eta 0:01:48 lr 0.000303 wd 0.0500 time 0.5644 (0.5851) data time 0.0007 (0.0017) model time 0.5637 (0.5850) loss 7.5673 (7.2682) grad_norm 2.6903 (2.5311) loss_scale 512.0000 (487.6190) mem 22339MB +[2024-07-25 11:29:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][450/625] eta 0:01:42 lr 0.000303 wd 0.0500 time 0.5716 (0.5849) data time 0.0006 (0.0017) model time 0.5710 (0.5847) loss 7.2080 (7.2762) grad_norm 2.8499 (2.5308) loss_scale 512.0000 (488.1596) mem 22339MB +[2024-07-25 11:29:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][460/625] eta 0:01:36 lr 0.000303 wd 0.0500 time 0.5748 (0.5846) data time 0.0006 (0.0017) model time 0.5742 (0.5844) loss 6.5031 (7.2703) grad_norm 2.4030 (2.5228) loss_scale 512.0000 (488.6768) mem 22339MB +[2024-07-25 11:29:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][470/625] eta 0:01:30 lr 0.000303 wd 0.0500 time 0.5716 (0.5844) data time 0.0006 (0.0016) model time 0.5710 (0.5841) loss 6.6489 (7.2633) grad_norm 1.6830 (2.5457) loss_scale 512.0000 (489.1720) mem 22339MB +[2024-07-25 11:29:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][480/625] eta 0:01:24 lr 0.000303 wd 0.0500 time 0.5727 (0.5842) data time 0.0008 (0.0016) model time 0.5719 (0.5838) loss 7.5398 (7.2668) grad_norm 2.2993 (2.5441) loss_scale 512.0000 (489.6466) mem 22339MB +[2024-07-25 11:30:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][490/625] eta 0:01:18 lr 0.000303 wd 0.0500 time 0.5707 (0.5840) data time 0.0008 (0.0016) model time 0.5698 (0.5836) loss 8.3854 (7.2740) grad_norm 1.6668 (2.5410) loss_scale 512.0000 (490.1018) mem 22339MB +[2024-07-25 11:30:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][500/625] eta 0:01:12 lr 0.000302 wd 0.0500 time 0.5712 (0.5838) data time 0.0006 (0.0016) model time 0.5706 (0.5834) loss 7.2035 (7.2773) grad_norm 2.2926 (2.5374) loss_scale 512.0000 (490.5389) mem 22339MB +[2024-07-25 11:30:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][510/625] eta 0:01:07 lr 0.000302 wd 0.0500 time 0.5765 (0.5836) data time 0.0008 (0.0016) model time 0.5757 (0.5831) loss 7.6643 (7.2785) grad_norm 2.2512 (2.5556) loss_scale 512.0000 (490.9589) mem 22339MB +[2024-07-25 11:30:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][520/625] eta 0:01:01 lr 0.000302 wd 0.0500 time 0.5726 (0.5834) data time 0.0006 (0.0016) model time 0.5719 (0.5829) loss 8.2181 (7.2782) grad_norm 1.7694 (2.5554) loss_scale 512.0000 (491.3628) mem 22339MB +[2024-07-25 11:30:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][530/625] eta 0:00:55 lr 0.000302 wd 0.0500 time 0.5657 (0.5838) data time 0.0009 (0.0016) model time 0.5648 (0.5833) loss 7.0284 (7.2743) grad_norm 3.4374 (2.5527) loss_scale 512.0000 (491.7514) mem 22339MB +[2024-07-25 11:30:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][540/625] eta 0:00:49 lr 0.000302 wd 0.0500 time 0.5739 (0.5839) data time 0.0007 (0.0015) model time 0.5732 (0.5835) loss 8.2156 (7.2755) grad_norm 2.1474 (2.5545) loss_scale 512.0000 (492.1257) mem 22339MB +[2024-07-25 11:30:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][550/625] eta 0:00:43 lr 0.000302 wd 0.0500 time 0.7602 (0.5848) data time 0.0008 (0.0015) model time 0.7594 (0.5845) loss 7.8691 (7.2768) grad_norm 2.3992 (2.5539) loss_scale 512.0000 (492.4864) mem 22339MB +[2024-07-25 11:30:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][560/625] eta 0:00:38 lr 0.000302 wd 0.0500 time 0.7040 (0.5853) data time 0.0006 (0.0015) model time 0.7034 (0.5850) loss 7.4058 (7.2746) grad_norm 1.7234 (2.5475) loss_scale 512.0000 (492.8342) mem 22339MB +[2024-07-25 11:30:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][570/625] eta 0:00:32 lr 0.000302 wd 0.0500 time 0.7167 (0.5860) data time 0.0008 (0.0015) model time 0.7158 (0.5857) loss 8.6776 (7.2746) grad_norm 2.0238 (2.5417) loss_scale 512.0000 (493.1699) mem 22339MB +[2024-07-25 11:30:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][580/625] eta 0:00:26 lr 0.000302 wd 0.0500 time 0.7212 (0.5860) data time 0.0006 (0.0015) model time 0.7206 (0.5858) loss 6.0947 (7.2737) grad_norm 2.0031 (2.5466) loss_scale 512.0000 (493.4940) mem 22339MB +[2024-07-25 11:31:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][590/625] eta 0:00:20 lr 0.000302 wd 0.0500 time 0.5708 (0.5862) data time 0.0008 (0.0015) model time 0.5700 (0.5860) loss 6.5059 (7.2739) grad_norm 2.4162 (2.5487) loss_scale 512.0000 (493.8071) mem 22339MB +[2024-07-25 11:31:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][600/625] eta 0:00:14 lr 0.000302 wd 0.0500 time 0.5743 (0.5860) data time 0.0007 (0.0015) model time 0.5736 (0.5858) loss 7.8047 (7.2714) grad_norm 2.2637 (2.5411) loss_scale 512.0000 (494.1098) mem 22339MB +[2024-07-25 11:31:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][610/625] eta 0:00:08 lr 0.000301 wd 0.0500 time 0.5709 (0.5858) data time 0.0004 (0.0015) model time 0.5705 (0.5855) loss 6.8501 (7.2708) grad_norm 1.7219 (2.5434) loss_scale 512.0000 (494.4026) mem 22339MB +[2024-07-25 11:31:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [207/300][620/625] eta 0:00:02 lr 0.000301 wd 0.0500 time 0.5787 (0.5857) data time 0.0006 (0.0015) model time 0.5782 (0.5853) loss 7.0242 (7.2707) grad_norm 2.7904 (2.5399) loss_scale 512.0000 (494.6860) mem 22339MB +[2024-07-25 11:31:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 207 training takes 0:06:05 +[2024-07-25 11:31:19 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 11:31:21 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 11:31:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.480 (0.480) Loss 0.5068 (0.5068) Acc@1 90.137 (90.137) Acc@5 98.584 (98.584) Mem 22339MB +[2024-07-25 11:31:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.158) Loss 0.7642 (0.6216) Acc@1 82.471 (87.385) Acc@5 97.119 (97.949) Mem 22339MB +[2024-07-25 11:31:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8706 (0.7170) Acc@1 79.346 (84.501) Acc@5 96.191 (97.077) Mem 22339MB +[2024-07-25 11:31:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.115 Acc@5 97.073 +[2024-07-25 11:31:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.1% +[2024-07-25 11:31:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.808 (0.808) Loss 0.5049 (0.5049) Acc@1 90.332 (90.332) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-25 11:31:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.188) Loss 0.7510 (0.6214) Acc@1 83.350 (87.540) Acc@5 96.631 (98.002) Mem 22339MB +[2024-07-25 11:31:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.158) Loss 0.8667 (0.7123) Acc@1 79.053 (84.577) Acc@5 95.947 (97.138) Mem 22339MB +[2024-07-25 11:31:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.199 Acc@5 97.115 +[2024-07-25 11:31:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.2% +[2024-07-25 11:31:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.20% +[2024-07-25 11:31:28 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 11:31:29 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 11:31:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][0/625] eta 0:09:40 lr 0.000301 wd 0.0500 time 0.9284 (0.9284) data time 0.4102 (0.4102) model time 0.0000 (0.0000) loss 6.6212 (6.6212) grad_norm 3.3148 (3.3148) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:31:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][10/625] eta 0:06:12 lr 0.000301 wd 0.0500 time 0.5715 (0.6051) data time 0.0006 (0.0379) model time 0.0000 (0.0000) loss 7.6719 (7.3185) grad_norm 1.9288 (2.3241) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:31:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][20/625] eta 0:05:57 lr 0.000301 wd 0.0500 time 0.5684 (0.5907) data time 0.0008 (0.0203) model time 0.0000 (0.0000) loss 8.4134 (7.2658) grad_norm 2.0718 (2.2928) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:31:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][30/625] eta 0:05:48 lr 0.000301 wd 0.0500 time 0.5721 (0.5855) data time 0.0006 (0.0140) model time 0.0000 (0.0000) loss 7.6653 (7.2705) grad_norm 4.2592 (2.3785) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:31:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][40/625] eta 0:05:41 lr 0.000301 wd 0.0500 time 0.5718 (0.5830) data time 0.0006 (0.0108) model time 0.0000 (0.0000) loss 7.9309 (7.3073) grad_norm 1.7951 (2.4093) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:31:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][50/625] eta 0:05:34 lr 0.000301 wd 0.0500 time 0.5200 (0.5826) data time 0.0008 (0.0088) model time 0.0000 (0.0000) loss 7.0300 (7.3293) grad_norm 1.7833 (2.3232) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:32:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][60/625] eta 0:05:28 lr 0.000301 wd 0.0500 time 0.5740 (0.5813) data time 0.0008 (0.0075) model time 0.5732 (0.5740) loss 6.9002 (7.2923) grad_norm 2.2075 (2.3364) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:32:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][70/625] eta 0:05:22 lr 0.000301 wd 0.0500 time 0.5692 (0.5803) data time 0.0008 (0.0066) model time 0.5684 (0.5737) loss 6.3999 (7.2660) grad_norm 2.5860 (2.3349) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:32:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][80/625] eta 0:05:15 lr 0.000301 wd 0.0500 time 0.5747 (0.5795) data time 0.0006 (0.0059) model time 0.5741 (0.5733) loss 6.9475 (7.2494) grad_norm 1.7830 (2.3818) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:32:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][90/625] eta 0:05:09 lr 0.000301 wd 0.0500 time 0.5712 (0.5789) data time 0.0008 (0.0053) model time 0.5704 (0.5734) loss 7.7808 (7.3236) grad_norm 1.9687 (2.3980) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:32:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][100/625] eta 0:05:03 lr 0.000300 wd 0.0500 time 0.5731 (0.5786) data time 0.0007 (0.0049) model time 0.5724 (0.5736) loss 6.7195 (7.3323) grad_norm 2.0661 (2.4865) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:32:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][110/625] eta 0:04:57 lr 0.000300 wd 0.0500 time 0.5736 (0.5783) data time 0.0008 (0.0045) model time 0.5729 (0.5738) loss 8.2363 (7.3268) grad_norm 3.0286 (2.5400) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:32:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][120/625] eta 0:04:52 lr 0.000300 wd 0.0500 time 0.5728 (0.5790) data time 0.0008 (0.0042) model time 0.5720 (0.5756) loss 6.0004 (7.3037) grad_norm 1.9963 (2.5413) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:32:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][130/625] eta 0:04:47 lr 0.000300 wd 0.0500 time 0.6333 (0.5800) data time 0.0006 (0.0040) model time 0.6327 (0.5775) loss 6.8882 (7.2763) grad_norm 1.8204 (2.5430) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:32:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][140/625] eta 0:04:41 lr 0.000300 wd 0.0500 time 0.5738 (0.5808) data time 0.0008 (0.0037) model time 0.5729 (0.5790) loss 6.2260 (7.2525) grad_norm 4.8408 (2.5471) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:32:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][150/625] eta 0:04:37 lr 0.000300 wd 0.0500 time 0.7597 (0.5852) data time 0.0006 (0.0035) model time 0.7591 (0.5857) loss 7.7970 (7.2742) grad_norm 2.1689 (2.5333) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:33:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][160/625] eta 0:04:33 lr 0.000300 wd 0.0500 time 0.7613 (0.5887) data time 0.0006 (0.0034) model time 0.7607 (0.5907) loss 7.6317 (7.2848) grad_norm 2.3204 (2.5092) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:33:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][170/625] eta 0:04:28 lr 0.000300 wd 0.0500 time 0.5764 (0.5898) data time 0.0008 (0.0032) model time 0.5755 (0.5921) loss 6.6732 (7.3138) grad_norm 1.6581 (2.4912) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:33:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][180/625] eta 0:04:22 lr 0.000300 wd 0.0500 time 0.5727 (0.5909) data time 0.0006 (0.0031) model time 0.5721 (0.5933) loss 6.3274 (7.3075) grad_norm 2.1438 (2.4750) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:33:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][190/625] eta 0:04:17 lr 0.000300 wd 0.0500 time 0.5745 (0.5908) data time 0.0008 (0.0030) model time 0.5737 (0.5930) loss 8.1879 (7.2958) grad_norm 1.9839 (2.4541) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:33:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][200/625] eta 0:04:10 lr 0.000300 wd 0.0500 time 0.5703 (0.5900) data time 0.0006 (0.0029) model time 0.5696 (0.5917) loss 7.4138 (7.2762) grad_norm 2.4902 (2.4654) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:33:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][210/625] eta 0:04:04 lr 0.000299 wd 0.0500 time 0.5702 (0.5893) data time 0.0006 (0.0028) model time 0.5696 (0.5906) loss 8.0167 (7.2823) grad_norm 2.1184 (2.4705) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:33:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][220/625] eta 0:03:58 lr 0.000299 wd 0.0500 time 0.5725 (0.5886) data time 0.0009 (0.0027) model time 0.5716 (0.5895) loss 7.5150 (7.3071) grad_norm 2.0598 (2.4618) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:33:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][230/625] eta 0:03:52 lr 0.000299 wd 0.0500 time 0.5723 (0.5880) data time 0.0006 (0.0026) model time 0.5717 (0.5886) loss 7.4688 (7.2834) grad_norm 2.9015 (2.4486) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:33:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][240/625] eta 0:03:46 lr 0.000299 wd 0.0500 time 0.5736 (0.5874) data time 0.0008 (0.0026) model time 0.5728 (0.5878) loss 7.9143 (7.2902) grad_norm 2.0122 (2.4357) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:33:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][250/625] eta 0:03:40 lr 0.000299 wd 0.0500 time 0.5791 (0.5869) data time 0.0007 (0.0025) model time 0.5784 (0.5871) loss 8.2580 (7.3009) grad_norm 7.5924 (2.4828) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:34:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][260/625] eta 0:03:34 lr 0.000299 wd 0.0500 time 0.5709 (0.5864) data time 0.0008 (0.0024) model time 0.5702 (0.5865) loss 6.9198 (7.2860) grad_norm 1.5852 (2.4734) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:34:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][270/625] eta 0:03:28 lr 0.000299 wd 0.0500 time 0.7075 (0.5865) data time 0.0008 (0.0024) model time 0.7067 (0.5865) loss 9.0165 (7.3024) grad_norm 2.1952 (2.4651) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:34:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][280/625] eta 0:03:22 lr 0.000299 wd 0.0500 time 0.5743 (0.5861) data time 0.0009 (0.0023) model time 0.5734 (0.5860) loss 7.7849 (7.3081) grad_norm 4.0816 (2.4678) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:34:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][290/625] eta 0:03:16 lr 0.000299 wd 0.0500 time 0.5751 (0.5857) data time 0.0006 (0.0023) model time 0.5746 (0.5855) loss 7.9495 (7.3026) grad_norm 2.4418 (2.4653) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:34:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][300/625] eta 0:03:10 lr 0.000299 wd 0.0500 time 0.5740 (0.5853) data time 0.0008 (0.0022) model time 0.5732 (0.5850) loss 7.4436 (7.2984) grad_norm 3.4320 (2.4665) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:34:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][310/625] eta 0:03:04 lr 0.000299 wd 0.0500 time 0.5758 (0.5850) data time 0.0009 (0.0022) model time 0.5749 (0.5846) loss 6.1628 (7.2983) grad_norm 2.8827 (2.4633) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:34:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][320/625] eta 0:02:58 lr 0.000298 wd 0.0500 time 0.5742 (0.5847) data time 0.0008 (0.0021) model time 0.5734 (0.5843) loss 7.8906 (7.3039) grad_norm 2.0920 (2.4574) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:34:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][330/625] eta 0:02:52 lr 0.000298 wd 0.0500 time 0.5749 (0.5845) data time 0.0008 (0.0021) model time 0.5741 (0.5839) loss 6.7438 (7.3103) grad_norm 2.3743 (2.4671) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:34:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][340/625] eta 0:02:46 lr 0.000298 wd 0.0500 time 0.5768 (0.5843) data time 0.0006 (0.0021) model time 0.5762 (0.5838) loss 7.2322 (7.3129) grad_norm 2.3162 (2.4675) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:34:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][350/625] eta 0:02:40 lr 0.000298 wd 0.0500 time 0.6949 (0.5849) data time 0.0006 (0.0020) model time 0.6943 (0.5844) loss 7.3549 (7.3032) grad_norm 2.3753 (2.4675) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:35:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][360/625] eta 0:02:35 lr 0.000298 wd 0.0500 time 0.7679 (0.5854) data time 0.0006 (0.0020) model time 0.7673 (0.5850) loss 7.4828 (7.3137) grad_norm 1.9204 (2.4698) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:35:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][370/625] eta 0:02:29 lr 0.000298 wd 0.0500 time 0.5721 (0.5864) data time 0.0008 (0.0020) model time 0.5714 (0.5861) loss 6.9289 (7.3146) grad_norm 3.6507 (2.4884) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:35:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][380/625] eta 0:02:24 lr 0.000298 wd 0.0500 time 0.7368 (0.5881) data time 0.0008 (0.0019) model time 0.7360 (0.5881) loss 6.8585 (7.3209) grad_norm 3.9996 (2.5025) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:35:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][390/625] eta 0:02:18 lr 0.000298 wd 0.0500 time 0.5745 (0.5884) data time 0.0008 (0.0019) model time 0.5737 (0.5884) loss 7.5512 (7.3159) grad_norm 2.7030 (2.5103) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:35:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][400/625] eta 0:02:12 lr 0.000298 wd 0.0500 time 0.5739 (0.5889) data time 0.0007 (0.0019) model time 0.5732 (0.5889) loss 6.0940 (7.3130) grad_norm 1.8769 (2.5014) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:35:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][410/625] eta 0:02:06 lr 0.000298 wd 0.0500 time 0.5741 (0.5890) data time 0.0006 (0.0019) model time 0.5734 (0.5890) loss 7.9011 (7.3193) grad_norm 3.8794 (2.5094) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:35:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][420/625] eta 0:02:00 lr 0.000298 wd 0.0500 time 0.5664 (0.5886) data time 0.0008 (0.0018) model time 0.5656 (0.5886) loss 6.7228 (7.3131) grad_norm 2.3495 (2.5168) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:35:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][430/625] eta 0:01:54 lr 0.000297 wd 0.0500 time 0.5758 (0.5883) data time 0.0006 (0.0018) model time 0.5752 (0.5882) loss 8.1453 (7.3105) grad_norm 2.3178 (2.5092) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:35:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][440/625] eta 0:01:48 lr 0.000297 wd 0.0500 time 0.5733 (0.5880) data time 0.0008 (0.0018) model time 0.5724 (0.5878) loss 7.0106 (7.3102) grad_norm 2.2618 (2.5174) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:35:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][450/625] eta 0:01:42 lr 0.000297 wd 0.0500 time 0.5710 (0.5877) data time 0.0008 (0.0018) model time 0.5702 (0.5874) loss 7.3689 (7.3054) grad_norm 2.6551 (2.5242) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:36:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][460/625] eta 0:01:36 lr 0.000297 wd 0.0500 time 0.5730 (0.5874) data time 0.0006 (0.0018) model time 0.5723 (0.5871) loss 7.5663 (7.2962) grad_norm 2.3847 (2.5384) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:36:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][470/625] eta 0:01:31 lr 0.000297 wd 0.0500 time 0.5721 (0.5871) data time 0.0006 (0.0017) model time 0.5714 (0.5868) loss 7.2987 (7.2852) grad_norm 3.0646 (2.5355) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:36:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][480/625] eta 0:01:25 lr 0.000297 wd 0.0500 time 0.5737 (0.5868) data time 0.0007 (0.0017) model time 0.5730 (0.5865) loss 6.5365 (7.2784) grad_norm 1.6832 (2.5246) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:36:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][490/625] eta 0:01:19 lr 0.000297 wd 0.0500 time 0.5706 (0.5869) data time 0.0006 (0.0017) model time 0.5700 (0.5866) loss 8.6990 (7.2747) grad_norm 1.8018 (2.5103) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:36:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][500/625] eta 0:01:13 lr 0.000297 wd 0.0500 time 0.5740 (0.5867) data time 0.0008 (0.0017) model time 0.5731 (0.5863) loss 6.9398 (7.2763) grad_norm 3.4625 (2.5049) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:36:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][510/625] eta 0:01:07 lr 0.000297 wd 0.0500 time 0.5681 (0.5864) data time 0.0006 (0.0017) model time 0.5674 (0.5860) loss 7.4126 (7.2706) grad_norm 3.9185 (2.5142) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:36:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][520/625] eta 0:01:01 lr 0.000297 wd 0.0500 time 0.5713 (0.5862) data time 0.0008 (0.0017) model time 0.5705 (0.5857) loss 8.1218 (7.2790) grad_norm 2.3393 (2.5074) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:36:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][530/625] eta 0:00:55 lr 0.000297 wd 0.0500 time 0.5706 (0.5860) data time 0.0007 (0.0016) model time 0.5699 (0.5855) loss 6.9389 (7.2818) grad_norm 2.6169 (2.5008) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:36:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][540/625] eta 0:00:49 lr 0.000296 wd 0.0500 time 0.5761 (0.5858) data time 0.0006 (0.0016) model time 0.5755 (0.5852) loss 7.4860 (7.2847) grad_norm 1.9591 (2.4940) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:36:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][550/625] eta 0:00:43 lr 0.000296 wd 0.0500 time 0.5724 (0.5856) data time 0.0006 (0.0016) model time 0.5718 (0.5850) loss 6.7279 (7.2855) grad_norm 3.2611 (2.5128) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:36:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][560/625] eta 0:00:38 lr 0.000296 wd 0.0500 time 0.5718 (0.5855) data time 0.0008 (0.0016) model time 0.5710 (0.5849) loss 7.9779 (7.2950) grad_norm 2.6580 (2.5228) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:37:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][570/625] eta 0:00:32 lr 0.000296 wd 0.0500 time 0.5685 (0.5856) data time 0.0006 (0.0016) model time 0.5678 (0.5850) loss 7.3904 (7.3010) grad_norm 2.0493 (2.5226) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:37:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][580/625] eta 0:00:26 lr 0.000296 wd 0.0500 time 0.7034 (0.5859) data time 0.0006 (0.0016) model time 0.7028 (0.5854) loss 7.3672 (7.3025) grad_norm 2.1021 (2.5222) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:37:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][590/625] eta 0:00:20 lr 0.000296 wd 0.0500 time 0.7042 (0.5868) data time 0.0007 (0.0016) model time 0.7035 (0.5863) loss 7.8102 (7.3048) grad_norm 1.7212 (2.5212) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:37:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][600/625] eta 0:00:14 lr 0.000296 wd 0.0500 time 0.7346 (0.5881) data time 0.0007 (0.0016) model time 0.7340 (0.5877) loss 7.0905 (7.3004) grad_norm 2.7824 (2.5146) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:37:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][610/625] eta 0:00:08 lr 0.000296 wd 0.0500 time 0.5725 (0.5879) data time 0.0006 (0.0015) model time 0.5720 (0.5875) loss 7.0259 (7.3022) grad_norm 2.4303 (2.5192) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:37:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [208/300][620/625] eta 0:00:02 lr 0.000296 wd 0.0500 time 0.6938 (0.5881) data time 0.0004 (0.0015) model time 0.6934 (0.5877) loss 5.8346 (7.2977) grad_norm 2.2703 (2.5255) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:37:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 208 training takes 0:06:07 +[2024-07-25 11:37:37 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 11:37:39 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 11:37:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.485 (0.485) Loss 0.5063 (0.5063) Acc@1 90.430 (90.430) Acc@5 98.828 (98.828) Mem 22339MB +[2024-07-25 11:37:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.7524 (0.6204) Acc@1 83.301 (87.491) Acc@5 96.924 (97.949) Mem 22339MB +[2024-07-25 11:37:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.8823 (0.7154) Acc@1 78.418 (84.552) Acc@5 96.094 (97.105) Mem 22339MB +[2024-07-25 11:37:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.149 Acc@5 97.111 +[2024-07-25 11:37:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.1% +[2024-07-25 11:37:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 84.15% +[2024-07-25 11:37:42 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 11:37:44 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 11:37:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.480 (0.480) Loss 0.5049 (0.5049) Acc@1 90.332 (90.332) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-25 11:37:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.7500 (0.6212) Acc@1 83.350 (87.549) Acc@5 96.631 (97.994) Mem 22339MB +[2024-07-25 11:37:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.8657 (0.7119) Acc@1 79.053 (84.598) Acc@5 95.947 (97.131) Mem 22339MB +[2024-07-25 11:37:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.217 Acc@5 97.109 +[2024-07-25 11:37:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.2% +[2024-07-25 11:37:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.22% +[2024-07-25 11:37:47 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 11:37:49 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 11:37:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][0/625] eta 0:08:45 lr 0.000296 wd 0.0500 time 0.8403 (0.8403) data time 0.3229 (0.3229) model time 0.0000 (0.0000) loss 7.9460 (7.9460) grad_norm 2.0099 (2.0099) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:37:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][10/625] eta 0:06:14 lr 0.000296 wd 0.0500 time 0.5709 (0.6090) data time 0.0006 (0.0301) model time 0.0000 (0.0000) loss 7.9474 (7.4726) grad_norm 1.9927 (2.6266) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:38:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][20/625] eta 0:05:59 lr 0.000295 wd 0.0500 time 0.5714 (0.5936) data time 0.0006 (0.0162) model time 0.0000 (0.0000) loss 7.7330 (7.3938) grad_norm 2.0704 (2.4700) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:38:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][30/625] eta 0:05:49 lr 0.000295 wd 0.0500 time 0.5679 (0.5871) data time 0.0008 (0.0113) model time 0.0000 (0.0000) loss 8.7468 (7.2635) grad_norm 1.9115 (2.4763) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:38:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][40/625] eta 0:05:41 lr 0.000295 wd 0.0500 time 0.5733 (0.5842) data time 0.0009 (0.0087) model time 0.0000 (0.0000) loss 6.8682 (7.1374) grad_norm 1.7815 (2.5965) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:38:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][50/625] eta 0:05:34 lr 0.000295 wd 0.0500 time 0.5753 (0.5823) data time 0.0008 (0.0073) model time 0.0000 (0.0000) loss 6.8134 (7.1669) grad_norm 1.8492 (2.5117) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:38:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][60/625] eta 0:05:28 lr 0.000295 wd 0.0500 time 0.5734 (0.5811) data time 0.0007 (0.0062) model time 0.5727 (0.5736) loss 8.0463 (7.1928) grad_norm 1.6780 (2.4402) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:38:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][70/625] eta 0:05:21 lr 0.000295 wd 0.0500 time 0.5713 (0.5801) data time 0.0006 (0.0055) model time 0.5706 (0.5735) loss 7.2175 (7.1504) grad_norm 2.5003 (2.3952) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:38:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][80/625] eta 0:05:15 lr 0.000295 wd 0.0500 time 0.5834 (0.5796) data time 0.0007 (0.0049) model time 0.5827 (0.5740) loss 8.3246 (7.1830) grad_norm 2.9473 (2.4545) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:38:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][90/625] eta 0:05:09 lr 0.000295 wd 0.0500 time 0.5713 (0.5788) data time 0.0009 (0.0045) model time 0.5704 (0.5735) loss 7.5456 (7.1916) grad_norm 4.1613 (2.6635) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:38:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][100/625] eta 0:05:03 lr 0.000295 wd 0.0500 time 0.5723 (0.5782) data time 0.0006 (0.0041) model time 0.5717 (0.5732) loss 7.8009 (7.2327) grad_norm 2.3976 (2.6991) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:38:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][110/625] eta 0:04:57 lr 0.000295 wd 0.0500 time 0.5684 (0.5777) data time 0.0006 (0.0038) model time 0.5677 (0.5729) loss 8.3010 (7.2375) grad_norm 1.7854 (2.6390) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:38:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][120/625] eta 0:04:51 lr 0.000295 wd 0.0500 time 0.5731 (0.5774) data time 0.0008 (0.0036) model time 0.5723 (0.5729) loss 6.7555 (7.2264) grad_norm 1.6196 (2.8146) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:39:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][130/625] eta 0:04:45 lr 0.000294 wd 0.0500 time 0.5671 (0.5771) data time 0.0008 (0.0034) model time 0.5663 (0.5729) loss 5.9206 (7.2258) grad_norm 2.1402 (2.7873) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:39:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][140/625] eta 0:04:39 lr 0.000294 wd 0.0500 time 0.5720 (0.5770) data time 0.0008 (0.0032) model time 0.5712 (0.5731) loss 6.5154 (7.2354) grad_norm 2.1461 (2.7701) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:39:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][150/625] eta 0:04:33 lr 0.000294 wd 0.0500 time 0.5731 (0.5768) data time 0.0006 (0.0031) model time 0.5725 (0.5730) loss 7.3233 (7.2550) grad_norm 1.6811 (2.7307) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:39:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][160/625] eta 0:04:28 lr 0.000294 wd 0.0500 time 0.6175 (0.5778) data time 0.0006 (0.0029) model time 0.6169 (0.5748) loss 7.5854 (7.2454) grad_norm 1.9115 (2.7013) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:39:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][170/625] eta 0:04:23 lr 0.000294 wd 0.0500 time 0.5694 (0.5782) data time 0.0006 (0.0028) model time 0.5688 (0.5756) loss 6.4887 (7.2376) grad_norm 2.3966 (2.6793) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:39:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][180/625] eta 0:04:18 lr 0.000294 wd 0.0500 time 0.7134 (0.5803) data time 0.0006 (0.0027) model time 0.7127 (0.5786) loss 6.7479 (7.2281) grad_norm 2.1028 (2.6634) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:39:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][190/625] eta 0:04:14 lr 0.000294 wd 0.0500 time 0.5703 (0.5840) data time 0.0006 (0.0026) model time 0.5697 (0.5837) loss 7.7644 (7.2444) grad_norm 2.2344 (2.6520) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:39:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][200/625] eta 0:04:09 lr 0.000294 wd 0.0500 time 0.5719 (0.5866) data time 0.0008 (0.0025) model time 0.5711 (0.5872) loss 6.5547 (7.2471) grad_norm 2.4014 (2.6490) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:39:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][210/625] eta 0:04:03 lr 0.000294 wd 0.0500 time 0.5681 (0.5867) data time 0.0007 (0.0024) model time 0.5675 (0.5872) loss 7.8004 (7.2501) grad_norm 2.0670 (2.6693) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:39:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][220/625] eta 0:03:58 lr 0.000294 wd 0.0500 time 0.6066 (0.5883) data time 0.0006 (0.0024) model time 0.6060 (0.5892) loss 7.8549 (7.2526) grad_norm 1.6632 (2.6450) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:40:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][230/625] eta 0:03:52 lr 0.000294 wd 0.0500 time 0.5724 (0.5884) data time 0.0006 (0.0023) model time 0.5718 (0.5892) loss 6.8402 (7.2702) grad_norm 3.4032 (2.6469) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:40:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][240/625] eta 0:03:46 lr 0.000293 wd 0.0500 time 0.5695 (0.5878) data time 0.0007 (0.0022) model time 0.5688 (0.5884) loss 8.1884 (7.2598) grad_norm 3.3174 (2.6384) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:40:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][250/625] eta 0:03:40 lr 0.000293 wd 0.0500 time 0.5709 (0.5873) data time 0.0006 (0.0022) model time 0.5704 (0.5877) loss 6.3805 (7.2462) grad_norm 2.0151 (2.6545) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:40:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][260/625] eta 0:03:34 lr 0.000293 wd 0.0500 time 0.5764 (0.5868) data time 0.0006 (0.0021) model time 0.5758 (0.5870) loss 7.9885 (7.2616) grad_norm 4.1794 (2.6567) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:40:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][270/625] eta 0:03:28 lr 0.000293 wd 0.0500 time 0.5741 (0.5863) data time 0.0006 (0.0021) model time 0.5735 (0.5864) loss 6.8174 (7.2740) grad_norm 2.0891 (2.6798) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:40:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][280/625] eta 0:03:22 lr 0.000293 wd 0.0500 time 0.5627 (0.5859) data time 0.0008 (0.0020) model time 0.5618 (0.5859) loss 6.4089 (7.2799) grad_norm 2.5865 (2.6770) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:40:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][290/625] eta 0:03:16 lr 0.000293 wd 0.0500 time 0.5743 (0.5856) data time 0.0006 (0.0020) model time 0.5736 (0.5854) loss 6.6473 (7.2917) grad_norm 2.4777 (2.6653) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:40:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][300/625] eta 0:03:10 lr 0.000293 wd 0.0500 time 0.5719 (0.5852) data time 0.0006 (0.0020) model time 0.5713 (0.5849) loss 8.3571 (7.3025) grad_norm 2.1852 (2.6596) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:40:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][310/625] eta 0:03:04 lr 0.000293 wd 0.0500 time 0.5709 (0.5849) data time 0.0008 (0.0019) model time 0.5701 (0.5845) loss 6.9625 (7.2983) grad_norm 3.9730 (2.6630) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:40:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][320/625] eta 0:02:58 lr 0.000293 wd 0.0500 time 0.5729 (0.5845) data time 0.0008 (0.0019) model time 0.5721 (0.5841) loss 6.6039 (7.3062) grad_norm 2.3070 (2.6701) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:41:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][330/625] eta 0:02:52 lr 0.000293 wd 0.0500 time 0.5726 (0.5842) data time 0.0007 (0.0019) model time 0.5719 (0.5837) loss 7.0560 (7.2950) grad_norm 3.3357 (2.6742) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:41:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][340/625] eta 0:02:46 lr 0.000293 wd 0.0500 time 0.5730 (0.5839) data time 0.0006 (0.0018) model time 0.5723 (0.5833) loss 8.7961 (7.3001) grad_norm 2.3527 (2.6635) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:41:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][350/625] eta 0:02:40 lr 0.000292 wd 0.0500 time 0.5709 (0.5837) data time 0.0006 (0.0018) model time 0.5703 (0.5831) loss 7.2706 (7.3132) grad_norm 1.5088 (2.6485) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:41:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][360/625] eta 0:02:34 lr 0.000292 wd 0.0500 time 0.5718 (0.5835) data time 0.0006 (0.0018) model time 0.5712 (0.5828) loss 7.6328 (7.3211) grad_norm 2.5749 (2.6347) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:41:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][370/625] eta 0:02:28 lr 0.000292 wd 0.0500 time 0.5719 (0.5832) data time 0.0006 (0.0017) model time 0.5713 (0.5825) loss 6.5469 (7.3227) grad_norm 1.7588 (2.6419) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:41:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][380/625] eta 0:02:22 lr 0.000292 wd 0.0500 time 0.5727 (0.5833) data time 0.0007 (0.0017) model time 0.5720 (0.5826) loss 5.6917 (7.3281) grad_norm 10.8847 (2.6554) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:41:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][390/625] eta 0:02:17 lr 0.000292 wd 0.0500 time 0.5689 (0.5836) data time 0.0008 (0.0017) model time 0.5681 (0.5829) loss 8.2725 (7.3396) grad_norm 2.8530 (2.7000) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:41:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][400/625] eta 0:02:11 lr 0.000292 wd 0.0500 time 0.7525 (0.5846) data time 0.0008 (0.0017) model time 0.7517 (0.5841) loss 8.1375 (7.3367) grad_norm 2.6038 (2.7083) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:41:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][410/625] eta 0:02:05 lr 0.000292 wd 0.0500 time 0.5703 (0.5858) data time 0.0008 (0.0017) model time 0.5695 (0.5854) loss 7.7182 (7.3343) grad_norm 1.6699 (2.6989) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:41:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][420/625] eta 0:02:00 lr 0.000292 wd 0.0500 time 0.5679 (0.5875) data time 0.0006 (0.0016) model time 0.5673 (0.5874) loss 7.1858 (7.3362) grad_norm 2.1882 (2.6843) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:42:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][430/625] eta 0:01:54 lr 0.000292 wd 0.0500 time 0.5676 (0.5874) data time 0.0008 (0.0016) model time 0.5668 (0.5872) loss 7.5189 (7.3489) grad_norm 3.4897 (2.6752) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:42:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][440/625] eta 0:01:48 lr 0.000292 wd 0.0500 time 0.6628 (0.5882) data time 0.0006 (0.0016) model time 0.6621 (0.5881) loss 7.7540 (7.3459) grad_norm 2.3876 (2.6745) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:42:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][450/625] eta 0:01:42 lr 0.000292 wd 0.0500 time 0.5733 (0.5881) data time 0.0006 (0.0016) model time 0.5727 (0.5880) loss 7.0449 (7.3417) grad_norm 4.4225 (2.6815) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:42:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][460/625] eta 0:01:36 lr 0.000291 wd 0.0500 time 0.5727 (0.5878) data time 0.0006 (0.0016) model time 0.5720 (0.5876) loss 6.2792 (7.3339) grad_norm 1.7650 (2.6815) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:42:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][470/625] eta 0:01:31 lr 0.000291 wd 0.0500 time 0.5731 (0.5875) data time 0.0008 (0.0016) model time 0.5723 (0.5873) loss 6.0320 (7.3288) grad_norm 1.9823 (2.6704) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:42:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][480/625] eta 0:01:25 lr 0.000291 wd 0.0500 time 0.5741 (0.5873) data time 0.0008 (0.0015) model time 0.5733 (0.5870) loss 8.6447 (7.3321) grad_norm 1.8353 (2.6572) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:42:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][490/625] eta 0:01:19 lr 0.000291 wd 0.0500 time 0.5731 (0.5871) data time 0.0006 (0.0015) model time 0.5725 (0.5868) loss 6.9380 (7.3283) grad_norm 2.0172 (2.6492) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:42:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][500/625] eta 0:01:13 lr 0.000291 wd 0.0500 time 0.5762 (0.5868) data time 0.0007 (0.0015) model time 0.5755 (0.5865) loss 7.4413 (7.3288) grad_norm 2.4015 (2.6477) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:42:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][510/625] eta 0:01:07 lr 0.000291 wd 0.0500 time 0.5754 (0.5866) data time 0.0008 (0.0015) model time 0.5746 (0.5862) loss 7.5930 (7.3271) grad_norm 1.7455 (2.6921) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:42:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][520/625] eta 0:01:01 lr 0.000291 wd 0.0500 time 0.5739 (0.5864) data time 0.0007 (0.0015) model time 0.5732 (0.5860) loss 6.9659 (7.3332) grad_norm 3.1666 (2.6835) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:43:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][530/625] eta 0:00:55 lr 0.000291 wd 0.0500 time 0.5713 (0.5861) data time 0.0008 (0.0015) model time 0.5706 (0.5857) loss 6.6137 (7.3336) grad_norm 2.5764 (2.6853) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:43:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][540/625] eta 0:00:49 lr 0.000291 wd 0.0500 time 0.5731 (0.5859) data time 0.0006 (0.0015) model time 0.5725 (0.5854) loss 7.3186 (7.3363) grad_norm 4.3294 (2.6891) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:43:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][550/625] eta 0:00:43 lr 0.000291 wd 0.0500 time 0.5714 (0.5857) data time 0.0009 (0.0015) model time 0.5705 (0.5852) loss 9.0093 (7.3353) grad_norm 3.1903 (2.6827) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:43:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][560/625] eta 0:00:38 lr 0.000291 wd 0.0500 time 0.5712 (0.5855) data time 0.0006 (0.0014) model time 0.5706 (0.5850) loss 7.4051 (7.3368) grad_norm 2.3463 (2.6731) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:43:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][570/625] eta 0:00:32 lr 0.000290 wd 0.0500 time 0.5710 (0.5853) data time 0.0006 (0.0014) model time 0.5704 (0.5848) loss 7.8669 (7.3406) grad_norm 2.8831 (2.7002) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:43:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][580/625] eta 0:00:26 lr 0.000290 wd 0.0500 time 0.5745 (0.5852) data time 0.0008 (0.0014) model time 0.5738 (0.5846) loss 8.2552 (7.3397) grad_norm 3.2716 (2.6985) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:43:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][590/625] eta 0:00:20 lr 0.000290 wd 0.0500 time 0.5733 (0.5850) data time 0.0008 (0.0014) model time 0.5725 (0.5844) loss 7.6577 (7.3354) grad_norm 1.8223 (2.6897) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:43:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][600/625] eta 0:00:14 lr 0.000290 wd 0.0500 time 0.5723 (0.5851) data time 0.0006 (0.0014) model time 0.5717 (0.5845) loss 7.6479 (7.3297) grad_norm 2.3043 (2.6894) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:43:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][610/625] eta 0:00:08 lr 0.000290 wd 0.0500 time 0.5595 (0.5852) data time 0.0004 (0.0014) model time 0.5590 (0.5846) loss 7.6737 (7.3301) grad_norm 2.4966 (2.6893) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:43:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [209/300][620/625] eta 0:00:02 lr 0.000290 wd 0.0500 time 0.7039 (0.5856) data time 0.0004 (0.0014) model time 0.7035 (0.5851) loss 6.8067 (7.3291) grad_norm 1.8943 (2.6926) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:43:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 209 training takes 0:06:06 +[2024-07-25 11:43:55 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 11:43:57 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 11:43:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.474 (0.474) Loss 0.4971 (0.4971) Acc@1 90.283 (90.283) Acc@5 98.730 (98.730) Mem 22339MB +[2024-07-25 11:43:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7642 (0.6187) Acc@1 82.812 (87.300) Acc@5 96.631 (97.918) Mem 22339MB +[2024-07-25 11:44:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.8628 (0.7134) Acc@1 79.492 (84.521) Acc@5 96.045 (97.031) Mem 22339MB +[2024-07-25 11:44:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.221 Acc@5 97.033 +[2024-07-25 11:44:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.2% +[2024-07-25 11:44:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 84.22% +[2024-07-25 11:44:00 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 11:44:03 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 11:44:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.484 (0.484) Loss 0.5049 (0.5049) Acc@1 90.381 (90.381) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-25 11:44:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.7495 (0.6211) Acc@1 83.398 (87.553) Acc@5 96.729 (98.016) Mem 22339MB +[2024-07-25 11:44:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8657 (0.7115) Acc@1 78.955 (84.594) Acc@5 95.947 (97.147) Mem 22339MB +[2024-07-25 11:44:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.209 Acc@5 97.123 +[2024-07-25 11:44:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.2% +[2024-07-25 11:44:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][0/625] eta 0:14:31 lr 0.000290 wd 0.0500 time 1.3947 (1.3947) data time 0.4764 (0.4764) model time 0.0000 (0.0000) loss 7.2091 (7.2091) grad_norm 2.0482 (2.0482) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:44:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][10/625] eta 0:07:36 lr 0.000290 wd 0.0500 time 0.7588 (0.7422) data time 0.0008 (0.0441) model time 0.0000 (0.0000) loss 9.1112 (7.1813) grad_norm 8.1144 (3.1158) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:44:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][20/625] eta 0:06:46 lr 0.000290 wd 0.0500 time 0.5824 (0.6724) data time 0.0008 (0.0235) model time 0.0000 (0.0000) loss 7.2710 (7.1818) grad_norm 2.8647 (3.0078) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:44:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][30/625] eta 0:06:25 lr 0.000290 wd 0.0500 time 0.5749 (0.6487) data time 0.0007 (0.0162) model time 0.0000 (0.0000) loss 7.3936 (7.1711) grad_norm 2.1595 (2.7391) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:44:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][40/625] eta 0:06:12 lr 0.000290 wd 0.0500 time 0.5725 (0.6360) data time 0.0006 (0.0125) model time 0.0000 (0.0000) loss 7.7874 (7.2074) grad_norm 2.3936 (2.7037) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:44:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][50/625] eta 0:05:58 lr 0.000290 wd 0.0500 time 0.5757 (0.6242) data time 0.0008 (0.0102) model time 0.0000 (0.0000) loss 7.8620 (7.2468) grad_norm 1.7892 (2.6645) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:44:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][60/625] eta 0:05:48 lr 0.000289 wd 0.0500 time 0.5797 (0.6160) data time 0.0008 (0.0086) model time 0.5789 (0.5731) loss 6.9007 (7.2577) grad_norm 2.6909 (2.6466) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:44:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][70/625] eta 0:05:38 lr 0.000289 wd 0.0500 time 0.5760 (0.6101) data time 0.0008 (0.0075) model time 0.5752 (0.5732) loss 8.0252 (7.2461) grad_norm 3.0767 (2.6372) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:44:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][80/625] eta 0:05:30 lr 0.000289 wd 0.0500 time 0.5778 (0.6059) data time 0.0006 (0.0067) model time 0.5772 (0.5740) loss 6.5126 (7.2060) grad_norm 2.3255 (2.6358) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:45:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][90/625] eta 0:05:22 lr 0.000289 wd 0.0500 time 0.5759 (0.6026) data time 0.0008 (0.0061) model time 0.5751 (0.5741) loss 7.5542 (7.2306) grad_norm 2.5017 (2.6253) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:45:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][100/625] eta 0:05:14 lr 0.000289 wd 0.0500 time 0.5774 (0.5997) data time 0.0007 (0.0056) model time 0.5767 (0.5739) loss 7.3608 (7.2059) grad_norm 2.1003 (2.5781) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:45:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][110/625] eta 0:05:07 lr 0.000289 wd 0.0500 time 0.5742 (0.5975) data time 0.0010 (0.0051) model time 0.5732 (0.5740) loss 7.5432 (7.2282) grad_norm 1.9512 (2.5363) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:45:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][120/625] eta 0:05:00 lr 0.000289 wd 0.0500 time 0.5829 (0.5956) data time 0.0009 (0.0048) model time 0.5820 (0.5739) loss 7.9674 (7.2422) grad_norm 2.6640 (2.5175) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:45:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][130/625] eta 0:04:54 lr 0.000289 wd 0.0500 time 0.5764 (0.5941) data time 0.0007 (0.0045) model time 0.5757 (0.5740) loss 6.9568 (7.2627) grad_norm 2.8307 (2.4838) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:45:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][140/625] eta 0:04:47 lr 0.000289 wd 0.0500 time 0.5833 (0.5928) data time 0.0009 (0.0042) model time 0.5824 (0.5740) loss 7.9053 (7.2624) grad_norm 2.0526 (2.4850) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:45:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][150/625] eta 0:04:40 lr 0.000289 wd 0.0500 time 0.5777 (0.5916) data time 0.0006 (0.0040) model time 0.5771 (0.5740) loss 7.5477 (7.2592) grad_norm 2.6536 (2.4696) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:45:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][160/625] eta 0:04:34 lr 0.000289 wd 0.0500 time 0.5808 (0.5906) data time 0.0006 (0.0038) model time 0.5802 (0.5741) loss 7.9480 (7.2569) grad_norm 3.0172 (2.4549) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 11:45:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][170/625] eta 0:04:28 lr 0.000288 wd 0.0500 time 0.5784 (0.5897) data time 0.0008 (0.0037) model time 0.5776 (0.5741) loss 8.2994 (7.2791) grad_norm 2.6689 (2.4469) loss_scale 1024.0000 (523.9766) mem 22339MB +[2024-07-25 11:45:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][180/625] eta 0:04:22 lr 0.000288 wd 0.0500 time 0.5745 (0.5888) data time 0.0009 (0.0035) model time 0.5737 (0.5741) loss 7.5867 (7.2784) grad_norm 1.9583 (2.4343) loss_scale 1024.0000 (551.6022) mem 22339MB +[2024-07-25 11:45:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][190/625] eta 0:04:15 lr 0.000288 wd 0.0500 time 0.5738 (0.5881) data time 0.0008 (0.0034) model time 0.5731 (0.5741) loss 7.3214 (7.2794) grad_norm 1.7756 (2.4181) loss_scale 1024.0000 (576.3351) mem 22339MB +[2024-07-25 11:46:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][200/625] eta 0:04:10 lr 0.000288 wd 0.0500 time 0.6269 (0.5892) data time 0.0006 (0.0032) model time 0.6263 (0.5765) loss 8.9230 (7.3037) grad_norm 2.0140 (2.3982) loss_scale 1024.0000 (598.6070) mem 22339MB +[2024-07-25 11:46:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][210/625] eta 0:04:04 lr 0.000288 wd 0.0500 time 0.5741 (0.5899) data time 0.0006 (0.0031) model time 0.5735 (0.5780) loss 6.6936 (7.2974) grad_norm 3.8858 (2.3848) loss_scale 1024.0000 (618.7678) mem 22339MB +[2024-07-25 11:46:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][220/625] eta 0:03:59 lr 0.000288 wd 0.0500 time 0.7289 (0.5919) data time 0.0006 (0.0030) model time 0.7283 (0.5813) loss 6.7275 (7.2964) grad_norm 2.0947 (2.3803) loss_scale 1024.0000 (637.1041) mem 22339MB +[2024-07-25 11:46:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][230/625] eta 0:03:54 lr 0.000288 wd 0.0500 time 0.5731 (0.5932) data time 0.0006 (0.0029) model time 0.5726 (0.5835) loss 6.9444 (7.3123) grad_norm 2.5082 (2.3658) loss_scale 1024.0000 (653.8528) mem 22339MB +[2024-07-25 11:46:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][240/625] eta 0:03:48 lr 0.000288 wd 0.0500 time 0.5750 (0.5945) data time 0.0006 (0.0029) model time 0.5744 (0.5857) loss 5.5779 (7.3039) grad_norm 1.7754 (2.3512) loss_scale 1024.0000 (669.2116) mem 22339MB +[2024-07-25 11:46:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][250/625] eta 0:03:43 lr 0.000288 wd 0.0500 time 0.7022 (0.5949) data time 0.0008 (0.0028) model time 0.7014 (0.5865) loss 7.2253 (7.3097) grad_norm 3.2020 (2.3632) loss_scale 1024.0000 (683.3466) mem 22339MB +[2024-07-25 11:46:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][260/625] eta 0:03:37 lr 0.000288 wd 0.0500 time 0.5784 (0.5950) data time 0.0007 (0.0027) model time 0.5777 (0.5870) loss 6.8566 (7.3022) grad_norm 2.0003 (2.3621) loss_scale 1024.0000 (696.3985) mem 22339MB +[2024-07-25 11:46:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][270/625] eta 0:03:30 lr 0.000288 wd 0.0500 time 0.5739 (0.5942) data time 0.0008 (0.0026) model time 0.5731 (0.5864) loss 8.0981 (7.3097) grad_norm 2.4618 (2.3554) loss_scale 1024.0000 (708.4871) mem 22339MB +[2024-07-25 11:46:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][280/625] eta 0:03:24 lr 0.000287 wd 0.0500 time 0.5721 (0.5935) data time 0.0009 (0.0026) model time 0.5712 (0.5858) loss 7.9073 (7.3171) grad_norm 1.9899 (2.3432) loss_scale 1024.0000 (719.7153) mem 22339MB +[2024-07-25 11:46:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][290/625] eta 0:03:18 lr 0.000287 wd 0.0500 time 0.5812 (0.5929) data time 0.0008 (0.0025) model time 0.5804 (0.5853) loss 6.4961 (7.3070) grad_norm 1.9609 (2.3367) loss_scale 1024.0000 (730.1718) mem 22339MB +[2024-07-25 11:47:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][300/625] eta 0:03:12 lr 0.000287 wd 0.0500 time 0.5739 (0.5922) data time 0.0006 (0.0025) model time 0.5733 (0.5848) loss 8.3832 (7.3056) grad_norm 1.7291 (2.3379) loss_scale 1024.0000 (739.9336) mem 22339MB +[2024-07-25 11:47:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][310/625] eta 0:03:06 lr 0.000287 wd 0.0500 time 0.5819 (0.5917) data time 0.0008 (0.0024) model time 0.5811 (0.5844) loss 7.7171 (7.3198) grad_norm 4.5307 (2.3564) loss_scale 1024.0000 (749.0675) mem 22339MB +[2024-07-25 11:47:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][320/625] eta 0:03:00 lr 0.000287 wd 0.0500 time 0.5757 (0.5911) data time 0.0006 (0.0024) model time 0.5751 (0.5840) loss 6.7632 (7.3198) grad_norm 4.3587 (2.3787) loss_scale 1024.0000 (757.6324) mem 22339MB +[2024-07-25 11:47:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][330/625] eta 0:02:54 lr 0.000287 wd 0.0500 time 0.5719 (0.5906) data time 0.0006 (0.0023) model time 0.5713 (0.5836) loss 7.7767 (7.3184) grad_norm 2.9189 (2.4044) loss_scale 1024.0000 (765.6798) mem 22339MB +[2024-07-25 11:47:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][340/625] eta 0:02:48 lr 0.000287 wd 0.0500 time 0.5735 (0.5902) data time 0.0006 (0.0023) model time 0.5729 (0.5833) loss 6.3408 (7.3204) grad_norm 2.9326 (2.4119) loss_scale 1024.0000 (773.2551) mem 22339MB +[2024-07-25 11:47:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][350/625] eta 0:02:42 lr 0.000287 wd 0.0500 time 0.5708 (0.5897) data time 0.0007 (0.0022) model time 0.5702 (0.5829) loss 6.9988 (7.3194) grad_norm 2.2776 (2.4265) loss_scale 1024.0000 (780.3989) mem 22339MB +[2024-07-25 11:47:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][360/625] eta 0:02:36 lr 0.000287 wd 0.0500 time 0.5714 (0.5892) data time 0.0006 (0.0022) model time 0.5708 (0.5826) loss 7.7160 (7.3075) grad_norm 2.5144 (2.4257) loss_scale 1024.0000 (787.1468) mem 22339MB +[2024-07-25 11:47:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][370/625] eta 0:02:30 lr 0.000287 wd 0.0500 time 0.5789 (0.5888) data time 0.0008 (0.0022) model time 0.5781 (0.5823) loss 7.6298 (7.3037) grad_norm 2.1524 (2.4243) loss_scale 1024.0000 (793.5310) mem 22339MB +[2024-07-25 11:47:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][380/625] eta 0:02:24 lr 0.000287 wd 0.0500 time 0.5764 (0.5884) data time 0.0007 (0.0021) model time 0.5757 (0.5820) loss 6.4540 (7.3104) grad_norm 1.5848 (2.4161) loss_scale 1024.0000 (799.5801) mem 22339MB +[2024-07-25 11:47:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][390/625] eta 0:02:18 lr 0.000286 wd 0.0500 time 0.5758 (0.5881) data time 0.0008 (0.0021) model time 0.5750 (0.5817) loss 7.1707 (7.3034) grad_norm 1.8168 (2.4231) loss_scale 1024.0000 (805.3197) mem 22339MB +[2024-07-25 11:48:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][400/625] eta 0:02:12 lr 0.000286 wd 0.0500 time 0.5749 (0.5877) data time 0.0010 (0.0021) model time 0.5739 (0.5815) loss 7.2178 (7.3052) grad_norm 1.6282 (2.4234) loss_scale 1024.0000 (810.7731) mem 22339MB +[2024-07-25 11:48:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][410/625] eta 0:02:06 lr 0.000286 wd 0.0500 time 0.5757 (0.5874) data time 0.0006 (0.0020) model time 0.5750 (0.5813) loss 6.2276 (7.3083) grad_norm 2.8106 (2.4232) loss_scale 1024.0000 (815.9611) mem 22339MB +[2024-07-25 11:48:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][420/625] eta 0:02:00 lr 0.000286 wd 0.0500 time 0.7401 (0.5883) data time 0.0006 (0.0020) model time 0.7395 (0.5825) loss 8.0080 (7.3135) grad_norm 2.5415 (2.4184) loss_scale 1024.0000 (820.9026) mem 22339MB +[2024-07-25 11:48:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][430/625] eta 0:01:54 lr 0.000286 wd 0.0500 time 0.7705 (0.5886) data time 0.0006 (0.0020) model time 0.7699 (0.5829) loss 7.0192 (7.3139) grad_norm 2.5658 (2.4182) loss_scale 1024.0000 (825.6148) mem 22339MB +[2024-07-25 11:48:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][440/625] eta 0:01:48 lr 0.000286 wd 0.0500 time 0.5755 (0.5889) data time 0.0007 (0.0020) model time 0.5749 (0.5834) loss 5.8399 (7.3175) grad_norm 1.9064 (2.4225) loss_scale 1024.0000 (830.1134) mem 22339MB +[2024-07-25 11:48:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][450/625] eta 0:01:43 lr 0.000286 wd 0.0500 time 0.7350 (0.5900) data time 0.0008 (0.0019) model time 0.7342 (0.5847) loss 8.7961 (7.3150) grad_norm 1.9467 (2.4160) loss_scale 1024.0000 (834.4124) mem 22339MB +[2024-07-25 11:48:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][460/625] eta 0:01:37 lr 0.000286 wd 0.0500 time 0.5750 (0.5907) data time 0.0006 (0.0019) model time 0.5744 (0.5857) loss 8.3548 (7.3144) grad_norm 2.0599 (2.4112) loss_scale 1024.0000 (838.5249) mem 22339MB +[2024-07-25 11:48:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][470/625] eta 0:01:31 lr 0.000286 wd 0.0500 time 0.7605 (0.5910) data time 0.0008 (0.0019) model time 0.7597 (0.5861) loss 6.5357 (7.3147) grad_norm 9.3341 (2.4253) loss_scale 1024.0000 (842.4628) mem 22339MB +[2024-07-25 11:48:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][480/625] eta 0:01:25 lr 0.000286 wd 0.0500 time 0.5783 (0.5914) data time 0.0007 (0.0019) model time 0.5776 (0.5867) loss 7.1679 (7.3113) grad_norm 4.7928 (2.4533) loss_scale 1024.0000 (846.2370) mem 22339MB +[2024-07-25 11:48:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][490/625] eta 0:01:19 lr 0.000286 wd 0.0500 time 0.5873 (0.5911) data time 0.0006 (0.0018) model time 0.5867 (0.5864) loss 7.0742 (7.3108) grad_norm 2.9618 (2.4661) loss_scale 1024.0000 (849.8574) mem 22339MB +[2024-07-25 11:49:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][500/625] eta 0:01:13 lr 0.000285 wd 0.0500 time 0.5736 (0.5907) data time 0.0006 (0.0018) model time 0.5730 (0.5861) loss 6.0891 (7.3094) grad_norm 1.9133 (2.4654) loss_scale 1024.0000 (853.3333) mem 22339MB +[2024-07-25 11:49:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][510/625] eta 0:01:07 lr 0.000285 wd 0.0500 time 0.5747 (0.5904) data time 0.0006 (0.0018) model time 0.5741 (0.5858) loss 7.9696 (7.3148) grad_norm 4.5023 (2.4722) loss_scale 1024.0000 (856.6732) mem 22339MB +[2024-07-25 11:49:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][520/625] eta 0:01:01 lr 0.000285 wd 0.0500 time 0.5769 (0.5901) data time 0.0006 (0.0018) model time 0.5763 (0.5855) loss 7.3423 (7.3153) grad_norm 2.6766 (2.4866) loss_scale 1024.0000 (859.8848) mem 22339MB +[2024-07-25 11:49:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][530/625] eta 0:00:56 lr 0.000285 wd 0.0500 time 0.5760 (0.5898) data time 0.0006 (0.0018) model time 0.5754 (0.5853) loss 6.1640 (7.3211) grad_norm 2.1313 (2.4898) loss_scale 1024.0000 (862.9755) mem 22339MB +[2024-07-25 11:49:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][540/625] eta 0:00:50 lr 0.000285 wd 0.0500 time 0.5727 (0.5895) data time 0.0008 (0.0018) model time 0.5719 (0.5850) loss 6.6991 (7.3184) grad_norm 3.2911 (2.4948) loss_scale 1024.0000 (865.9519) mem 22339MB +[2024-07-25 11:49:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][550/625] eta 0:00:44 lr 0.000285 wd 0.0500 time 0.5731 (0.5892) data time 0.0009 (0.0017) model time 0.5722 (0.5848) loss 6.8659 (7.3149) grad_norm 2.7978 (2.4973) loss_scale 1024.0000 (868.8203) mem 22339MB +[2024-07-25 11:49:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][560/625] eta 0:00:38 lr 0.000285 wd 0.0500 time 0.5754 (0.5890) data time 0.0006 (0.0017) model time 0.5749 (0.5845) loss 7.8318 (7.3161) grad_norm 1.6700 (2.4946) loss_scale 1024.0000 (871.5865) mem 22339MB +[2024-07-25 11:49:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][570/625] eta 0:00:32 lr 0.000285 wd 0.0500 time 0.5816 (0.5887) data time 0.0008 (0.0017) model time 0.5808 (0.5843) loss 6.7486 (7.3113) grad_norm 1.7608 (2.5001) loss_scale 1024.0000 (874.2557) mem 22339MB +[2024-07-25 11:49:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][580/625] eta 0:00:26 lr 0.000285 wd 0.0500 time 0.5775 (0.5885) data time 0.0006 (0.0017) model time 0.5769 (0.5841) loss 7.5632 (7.3157) grad_norm 5.0606 (2.5192) loss_scale 1024.0000 (876.8330) mem 22339MB +[2024-07-25 11:49:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][590/625] eta 0:00:20 lr 0.000285 wd 0.0500 time 0.5755 (0.5883) data time 0.0006 (0.0017) model time 0.5748 (0.5840) loss 6.5160 (7.3193) grad_norm 2.8025 (2.5212) loss_scale 1024.0000 (879.3232) mem 22339MB +[2024-07-25 11:50:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][600/625] eta 0:00:14 lr 0.000285 wd 0.0500 time 0.5741 (0.5880) data time 0.0006 (0.0017) model time 0.5735 (0.5838) loss 7.4803 (7.3191) grad_norm 1.9930 (2.5314) loss_scale 1024.0000 (881.7304) mem 22339MB +[2024-07-25 11:50:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][610/625] eta 0:00:08 lr 0.000284 wd 0.0500 time 0.5738 (0.5878) data time 0.0006 (0.0017) model time 0.5732 (0.5836) loss 7.7236 (7.3232) grad_norm 1.5944 (2.5319) loss_scale 1024.0000 (884.0589) mem 22339MB +[2024-07-25 11:50:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [210/300][620/625] eta 0:00:02 lr 0.000284 wd 0.0500 time 0.5762 (0.5876) data time 0.0005 (0.0016) model time 0.5757 (0.5835) loss 8.3234 (7.3240) grad_norm 1.7580 (2.5243) loss_scale 1024.0000 (886.3124) mem 22339MB +[2024-07-25 11:50:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 210 training takes 0:06:07 +[2024-07-25 11:50:14 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 11:50:16 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 11:50:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.480 (0.480) Loss 0.5029 (0.5029) Acc@1 89.844 (89.844) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 11:50:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.7686 (0.6199) Acc@1 82.715 (87.376) Acc@5 96.826 (98.029) Mem 22339MB +[2024-07-25 11:50:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.8740 (0.7170) Acc@1 79.395 (84.515) Acc@5 95.996 (97.070) Mem 22339MB +[2024-07-25 11:50:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.205 Acc@5 97.083 +[2024-07-25 11:50:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.2% +[2024-07-25 11:50:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.861 (0.861) Loss 0.5049 (0.5049) Acc@1 90.430 (90.430) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-25 11:50:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.193) Loss 0.7500 (0.6212) Acc@1 83.398 (87.562) Acc@5 96.777 (98.025) Mem 22339MB +[2024-07-25 11:50:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.161) Loss 0.8652 (0.7115) Acc@1 78.906 (84.635) Acc@5 95.947 (97.145) Mem 22339MB +[2024-07-25 11:50:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.251 Acc@5 97.123 +[2024-07-25 11:50:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.3% +[2024-07-25 11:50:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.25% +[2024-07-25 11:50:23 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 11:50:25 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 11:50:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][0/625] eta 0:09:22 lr 0.000284 wd 0.0500 time 0.9006 (0.9006) data time 0.3820 (0.3820) model time 0.0000 (0.0000) loss 6.0670 (6.0670) grad_norm 1.9087 (1.9087) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:50:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][10/625] eta 0:06:20 lr 0.000284 wd 0.0500 time 0.5690 (0.6185) data time 0.0008 (0.0355) model time 0.0000 (0.0000) loss 6.4521 (7.2242) grad_norm 2.5833 (2.1315) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:50:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][20/625] eta 0:06:09 lr 0.000284 wd 0.0500 time 0.5729 (0.6111) data time 0.0008 (0.0190) model time 0.0000 (0.0000) loss 7.3859 (7.2439) grad_norm 2.5133 (2.2119) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:50:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][30/625] eta 0:06:02 lr 0.000284 wd 0.0500 time 0.7529 (0.6096) data time 0.0008 (0.0132) model time 0.0000 (0.0000) loss 8.1383 (7.2731) grad_norm 2.4430 (2.3056) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:50:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][40/625] eta 0:06:01 lr 0.000284 wd 0.0500 time 0.7485 (0.6178) data time 0.0007 (0.0102) model time 0.0000 (0.0000) loss 7.3667 (7.3556) grad_norm 2.5488 (2.3321) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:50:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][50/625] eta 0:05:53 lr 0.000284 wd 0.0500 time 0.5696 (0.6156) data time 0.0006 (0.0084) model time 0.0000 (0.0000) loss 7.3756 (7.4094) grad_norm 2.0183 (2.3584) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:51:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][60/625] eta 0:05:44 lr 0.000284 wd 0.0500 time 0.5704 (0.6096) data time 0.0008 (0.0071) model time 0.5696 (0.5779) loss 7.8200 (7.3925) grad_norm 2.1174 (2.3292) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:51:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][70/625] eta 0:05:40 lr 0.000284 wd 0.0500 time 0.5702 (0.6130) data time 0.0006 (0.0063) model time 0.5697 (0.6053) loss 8.4199 (7.3791) grad_norm 2.3680 (2.2825) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:51:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][80/625] eta 0:05:31 lr 0.000284 wd 0.0500 time 0.5659 (0.6086) data time 0.0008 (0.0056) model time 0.5651 (0.5957) loss 7.8094 (7.3791) grad_norm 2.6858 (2.3270) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:51:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][90/625] eta 0:05:23 lr 0.000284 wd 0.0500 time 0.5682 (0.6048) data time 0.0006 (0.0051) model time 0.5676 (0.5901) loss 6.7025 (7.3596) grad_norm 2.8668 (2.4199) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:51:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][100/625] eta 0:05:15 lr 0.000283 wd 0.0500 time 0.5701 (0.6017) data time 0.0006 (0.0047) model time 0.5695 (0.5867) loss 5.8521 (7.3651) grad_norm 2.1602 (2.4621) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:51:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][110/625] eta 0:05:08 lr 0.000283 wd 0.0500 time 0.5692 (0.5993) data time 0.0008 (0.0043) model time 0.5684 (0.5846) loss 7.7691 (7.3633) grad_norm 2.0756 (2.4299) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:51:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][120/625] eta 0:05:01 lr 0.000283 wd 0.0500 time 0.5706 (0.5973) data time 0.0008 (0.0040) model time 0.5698 (0.5832) loss 7.4500 (7.3822) grad_norm 2.0495 (2.4065) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:51:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][130/625] eta 0:04:54 lr 0.000283 wd 0.0500 time 0.5748 (0.5956) data time 0.0006 (0.0038) model time 0.5742 (0.5819) loss 6.1935 (7.3608) grad_norm 1.2945 (2.3752) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:51:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][140/625] eta 0:04:48 lr 0.000283 wd 0.0500 time 0.5701 (0.5942) data time 0.0009 (0.0036) model time 0.5692 (0.5811) loss 6.6687 (7.3620) grad_norm 2.5236 (2.4005) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:51:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][150/625] eta 0:04:41 lr 0.000283 wd 0.0500 time 0.5706 (0.5929) data time 0.0006 (0.0034) model time 0.5701 (0.5804) loss 7.9233 (7.3529) grad_norm 3.0368 (2.4246) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:52:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][160/625] eta 0:04:35 lr 0.000283 wd 0.0500 time 0.5752 (0.5918) data time 0.0007 (0.0032) model time 0.5746 (0.5799) loss 7.9824 (7.3657) grad_norm 3.4792 (2.4892) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:52:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][170/625] eta 0:04:28 lr 0.000283 wd 0.0500 time 0.5724 (0.5908) data time 0.0006 (0.0031) model time 0.5718 (0.5793) loss 6.9485 (7.3628) grad_norm 1.7358 (2.4843) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:52:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][180/625] eta 0:04:22 lr 0.000283 wd 0.0500 time 0.5714 (0.5899) data time 0.0007 (0.0030) model time 0.5707 (0.5789) loss 5.6534 (7.3652) grad_norm 1.7956 (2.4907) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:52:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][190/625] eta 0:04:16 lr 0.000283 wd 0.0500 time 0.5725 (0.5892) data time 0.0006 (0.0029) model time 0.5719 (0.5786) loss 6.3471 (7.3350) grad_norm 2.0784 (2.4775) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:52:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][200/625] eta 0:04:10 lr 0.000283 wd 0.0500 time 0.5744 (0.5891) data time 0.0008 (0.0028) model time 0.5736 (0.5792) loss 8.4459 (7.3251) grad_norm 2.6891 (2.4744) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:52:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][210/625] eta 0:04:04 lr 0.000282 wd 0.0500 time 0.5745 (0.5884) data time 0.0006 (0.0027) model time 0.5739 (0.5789) loss 7.9209 (7.3220) grad_norm 2.6375 (2.4751) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:52:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][220/625] eta 0:03:58 lr 0.000282 wd 0.0500 time 0.5735 (0.5878) data time 0.0006 (0.0026) model time 0.5729 (0.5786) loss 7.1100 (7.3301) grad_norm 2.0052 (2.4648) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:52:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][230/625] eta 0:03:52 lr 0.000282 wd 0.0500 time 0.5712 (0.5876) data time 0.0009 (0.0025) model time 0.5703 (0.5788) loss 6.1765 (7.3265) grad_norm 1.7435 (2.4610) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:52:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][240/625] eta 0:03:46 lr 0.000282 wd 0.0500 time 0.5733 (0.5888) data time 0.0008 (0.0025) model time 0.5725 (0.5807) loss 7.9313 (7.3267) grad_norm 1.6606 (2.4404) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 11:52:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][250/625] eta 0:03:40 lr 0.000282 wd 0.0500 time 0.5727 (0.5887) data time 0.0008 (0.0024) model time 0.5719 (0.5809) loss 7.4652 (7.3199) grad_norm 2.4762 (inf) loss_scale 512.0000 (1011.7610) mem 22339MB +[2024-07-25 11:52:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][260/625] eta 0:03:35 lr 0.000282 wd 0.0500 time 0.5711 (0.5900) data time 0.0007 (0.0023) model time 0.5703 (0.5830) loss 6.9489 (7.3083) grad_norm 2.4215 (inf) loss_scale 512.0000 (992.6130) mem 22339MB +[2024-07-25 11:53:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][270/625] eta 0:03:30 lr 0.000282 wd 0.0500 time 0.5665 (0.5920) data time 0.0008 (0.0023) model time 0.5657 (0.5856) loss 6.5299 (7.2973) grad_norm 1.7047 (inf) loss_scale 512.0000 (974.8782) mem 22339MB +[2024-07-25 11:53:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][280/625] eta 0:03:24 lr 0.000282 wd 0.0500 time 0.5692 (0.5915) data time 0.0007 (0.0022) model time 0.5686 (0.5853) loss 7.3762 (7.3069) grad_norm 2.5732 (inf) loss_scale 512.0000 (958.4057) mem 22339MB +[2024-07-25 11:53:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][290/625] eta 0:03:18 lr 0.000282 wd 0.0500 time 0.5731 (0.5919) data time 0.0008 (0.0022) model time 0.5723 (0.5860) loss 7.2085 (7.3016) grad_norm 3.7739 (inf) loss_scale 512.0000 (943.0653) mem 22339MB +[2024-07-25 11:53:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][300/625] eta 0:03:12 lr 0.000282 wd 0.0500 time 0.5748 (0.5918) data time 0.0009 (0.0021) model time 0.5739 (0.5860) loss 7.6439 (7.3001) grad_norm 2.7122 (inf) loss_scale 512.0000 (928.7442) mem 22339MB +[2024-07-25 11:53:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][310/625] eta 0:03:06 lr 0.000282 wd 0.0500 time 0.5746 (0.5912) data time 0.0008 (0.0021) model time 0.5738 (0.5855) loss 7.4068 (7.2949) grad_norm 4.6617 (inf) loss_scale 512.0000 (915.3441) mem 22339MB +[2024-07-25 11:53:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][320/625] eta 0:03:00 lr 0.000281 wd 0.0500 time 0.5603 (0.5907) data time 0.0007 (0.0021) model time 0.5595 (0.5851) loss 6.9214 (7.2907) grad_norm 1.7158 (inf) loss_scale 512.0000 (902.7788) mem 22339MB +[2024-07-25 11:53:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][330/625] eta 0:02:54 lr 0.000281 wd 0.0500 time 0.5708 (0.5902) data time 0.0006 (0.0020) model time 0.5701 (0.5847) loss 6.0343 (7.2998) grad_norm 4.4421 (inf) loss_scale 512.0000 (890.9728) mem 22339MB +[2024-07-25 11:53:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][340/625] eta 0:02:48 lr 0.000281 wd 0.0500 time 0.5769 (0.5898) data time 0.0008 (0.0020) model time 0.5761 (0.5843) loss 7.9949 (7.2975) grad_norm 3.1455 (inf) loss_scale 512.0000 (879.8592) mem 22339MB +[2024-07-25 11:53:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][350/625] eta 0:02:42 lr 0.000281 wd 0.0500 time 0.5717 (0.5893) data time 0.0006 (0.0020) model time 0.5710 (0.5839) loss 5.9007 (7.2957) grad_norm 1.8924 (inf) loss_scale 512.0000 (869.3789) mem 22339MB +[2024-07-25 11:53:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][360/625] eta 0:02:36 lr 0.000281 wd 0.0500 time 0.5628 (0.5889) data time 0.0006 (0.0019) model time 0.5621 (0.5836) loss 7.0172 (7.2968) grad_norm 2.2625 (inf) loss_scale 512.0000 (859.4792) mem 22339MB +[2024-07-25 11:54:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][370/625] eta 0:02:30 lr 0.000281 wd 0.0500 time 0.5683 (0.5885) data time 0.0006 (0.0019) model time 0.5677 (0.5832) loss 8.2688 (7.2926) grad_norm 1.8305 (inf) loss_scale 512.0000 (850.1132) mem 22339MB +[2024-07-25 11:54:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][380/625] eta 0:02:24 lr 0.000281 wd 0.0500 time 0.5725 (0.5881) data time 0.0008 (0.0019) model time 0.5717 (0.5830) loss 6.7029 (7.2827) grad_norm 1.9676 (inf) loss_scale 512.0000 (841.2388) mem 22339MB +[2024-07-25 11:54:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][390/625] eta 0:02:18 lr 0.000281 wd 0.0500 time 0.5741 (0.5877) data time 0.0006 (0.0019) model time 0.5735 (0.5827) loss 7.6229 (7.2734) grad_norm 2.7857 (inf) loss_scale 512.0000 (832.8184) mem 22339MB +[2024-07-25 11:54:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][400/625] eta 0:02:12 lr 0.000281 wd 0.0500 time 0.5704 (0.5874) data time 0.0007 (0.0018) model time 0.5697 (0.5824) loss 6.3986 (7.2688) grad_norm 3.0531 (inf) loss_scale 512.0000 (824.8180) mem 22339MB +[2024-07-25 11:54:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-25 11:54:23 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 11:54:26 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 12:38:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-25 12:38:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-25 12:38:39 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-25 12:38:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-25 12:38:49 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-25 12:38:50 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-25 12:38:50 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-25 12:38:50 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 211) +[2024-07-25 12:38:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-25 12:39:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][410/625] eta 0:07:40 lr 0.000281 wd 0.0500 time 0.5627 (2.1438) data time 0.0010 (0.1153) model time 0.5617 (2.0285) loss 7.8377 (7.9734) grad_norm 3.1517 (3.0701) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 12:39:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][420/625] eta 0:04:09 lr 0.000281 wd 0.0500 time 0.5643 (1.2151) data time 0.0010 (0.0480) model time 0.5634 (1.1671) loss 7.5615 (7.6415) grad_norm 2.3722 (2.5899) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 12:39:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][430/625] eta 0:03:10 lr 0.000281 wd 0.0500 time 0.5675 (0.9745) data time 0.0006 (0.0305) model time 0.5669 (0.9439) loss 7.7796 (7.5790) grad_norm 2.1899 (2.4205) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 12:39:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][440/625] eta 0:02:39 lr 0.000280 wd 0.0500 time 0.5617 (0.8636) data time 0.0009 (0.0225) model time 0.5608 (0.8411) loss 6.8742 (7.5523) grad_norm 1.8681 (2.3965) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 12:39:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][450/625] eta 0:02:20 lr 0.000280 wd 0.0500 time 0.5681 (0.8006) data time 0.0006 (0.0179) model time 0.5675 (0.7827) loss 7.0137 (7.5183) grad_norm 3.4230 (2.4521) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 12:39:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][460/625] eta 0:02:06 lr 0.000280 wd 0.0500 time 0.5641 (0.7662) data time 0.0008 (0.0149) model time 0.5633 (0.7513) loss 7.3693 (7.4352) grad_norm 2.0428 (2.4825) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 12:39:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][470/625] eta 0:01:54 lr 0.000280 wd 0.0500 time 0.5672 (0.7369) data time 0.0008 (0.0128) model time 0.5664 (0.7241) loss 8.2117 (7.3820) grad_norm 2.4842 (2.4807) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 12:39:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][480/625] eta 0:01:43 lr 0.000280 wd 0.0500 time 0.5679 (0.7152) data time 0.0009 (0.0113) model time 0.5671 (0.7039) loss 7.2278 (7.3455) grad_norm 2.1217 (2.4579) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 12:39:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][490/625] eta 0:01:34 lr 0.000280 wd 0.0500 time 0.5708 (0.6988) data time 0.0009 (0.0101) model time 0.5700 (0.6888) loss 6.6527 (7.3337) grad_norm 2.5224 (2.5438) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 12:40:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][500/625] eta 0:01:25 lr 0.000280 wd 0.0500 time 0.5718 (0.6857) data time 0.0008 (0.0091) model time 0.5710 (0.6766) loss 9.2898 (7.3604) grad_norm 3.3731 (2.5406) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 12:40:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][510/625] eta 0:01:17 lr 0.000280 wd 0.0500 time 0.5782 (0.6751) data time 0.0007 (0.0083) model time 0.5775 (0.6667) loss 6.2235 (7.3827) grad_norm 3.2449 (2.5274) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 12:40:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][520/625] eta 0:01:09 lr 0.000280 wd 0.0500 time 0.5763 (0.6661) data time 0.0009 (0.0077) model time 0.5755 (0.6584) loss 8.3293 (7.3773) grad_norm 3.7708 (2.5357) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 12:40:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][530/625] eta 0:01:02 lr 0.000280 wd 0.0500 time 0.5707 (0.6585) data time 0.0009 (0.0072) model time 0.5699 (0.6513) loss 8.1363 (7.3786) grad_norm 1.9893 (2.5927) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 12:40:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][540/625] eta 0:00:55 lr 0.000280 wd 0.0500 time 0.5666 (0.6521) data time 0.0006 (0.0067) model time 0.5659 (0.6454) loss 6.5820 (7.3797) grad_norm 2.0492 (2.5971) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 12:40:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][550/625] eta 0:00:48 lr 0.000279 wd 0.0500 time 0.5693 (0.6466) data time 0.0011 (0.0063) model time 0.5682 (0.6403) loss 7.3322 (7.3674) grad_norm 3.1306 (2.6293) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 12:40:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][560/625] eta 0:00:41 lr 0.000279 wd 0.0500 time 0.5717 (0.6418) data time 0.0009 (0.0060) model time 0.5708 (0.6358) loss 7.6029 (7.3637) grad_norm 1.7510 (2.6432) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 12:40:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][570/625] eta 0:00:35 lr 0.000279 wd 0.0500 time 0.5719 (0.6377) data time 0.0009 (0.0057) model time 0.5710 (0.6320) loss 8.2664 (7.3749) grad_norm 2.4594 (2.6180) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 12:40:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][580/625] eta 0:00:28 lr 0.000279 wd 0.0500 time 0.5684 (0.6338) data time 0.0010 (0.0054) model time 0.5674 (0.6284) loss 6.5078 (7.3601) grad_norm 2.2245 (2.5960) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 12:40:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][590/625] eta 0:00:22 lr 0.000279 wd 0.0500 time 0.5812 (0.6305) data time 0.0007 (0.0052) model time 0.5805 (0.6253) loss 7.0331 (7.3661) grad_norm 1.7056 (2.5712) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 12:40:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][600/625] eta 0:00:15 lr 0.000279 wd 0.0500 time 0.5680 (0.6274) data time 0.0007 (0.0049) model time 0.5673 (0.6225) loss 8.1210 (7.3556) grad_norm 1.7435 (2.5814) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 12:41:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][610/625] eta 0:00:09 lr 0.000279 wd 0.0500 time 0.5677 (0.6246) data time 0.0004 (0.0048) model time 0.5673 (0.6199) loss 7.2303 (7.3366) grad_norm 2.0420 (2.5584) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 12:41:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [211/300][620/625] eta 0:00:03 lr 0.000279 wd 0.0500 time 0.5741 (0.6221) data time 0.0004 (0.0046) model time 0.5736 (0.6176) loss 7.3492 (7.3223) grad_norm 2.3360 (2.5490) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 12:41:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 211 training takes 0:02:17 +[2024-07-25 12:41:12 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 12:41:15 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 12:41:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.511 (0.511) Loss 0.5010 (0.5010) Acc@1 90.771 (90.771) Acc@5 98.926 (98.926) Mem 22341MB +[2024-07-25 12:41:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.161) Loss 0.7969 (0.6326) Acc@1 81.641 (87.296) Acc@5 96.436 (97.909) Mem 22341MB +[2024-07-25 12:41:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.144) Loss 0.8774 (0.7217) Acc@1 79.004 (84.549) Acc@5 95.801 (97.049) Mem 22341MB +[2024-07-25 12:41:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.203 Acc@5 97.033 +[2024-07-25 12:41:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.2% +[2024-07-25 12:41:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 1.038 (1.038) Loss 0.5044 (0.5044) Acc@1 90.430 (90.430) Acc@5 98.926 (98.926) Mem 22341MB +[2024-07-25 12:41:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.208) Loss 0.7500 (0.6208) Acc@1 83.301 (87.567) Acc@5 96.875 (98.029) Mem 22341MB +[2024-07-25 12:41:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.169) Loss 0.8652 (0.7110) Acc@1 78.906 (84.647) Acc@5 95.947 (97.147) Mem 22341MB +[2024-07-25 12:41:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.267 Acc@5 97.127 +[2024-07-25 12:41:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.3% +[2024-07-25 12:41:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.27% +[2024-07-25 12:41:25 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 12:41:28 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 12:41:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][0/625] eta 0:11:47 lr 0.000279 wd 0.0500 time 1.1316 (1.1316) data time 0.4689 (0.4689) model time 0.0000 (0.0000) loss 7.7569 (7.7569) grad_norm 2.0908 (2.0908) loss_scale 512.0000 (512.0000) mem 22337MB +[2024-07-25 12:41:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][10/625] eta 0:06:21 lr 0.000279 wd 0.0500 time 0.5699 (0.6205) data time 0.0006 (0.0434) model time 0.0000 (0.0000) loss 7.3779 (7.3578) grad_norm 3.1420 (2.5784) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:41:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][20/625] eta 0:06:00 lr 0.000279 wd 0.0500 time 0.5686 (0.5960) data time 0.0007 (0.0231) model time 0.0000 (0.0000) loss 7.4966 (7.2415) grad_norm 2.1133 (2.4216) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:41:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][30/625] eta 0:05:49 lr 0.000279 wd 0.0500 time 0.5710 (0.5872) data time 0.0006 (0.0159) model time 0.0000 (0.0000) loss 8.0919 (7.1382) grad_norm 2.2797 (2.5198) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:41:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][40/625] eta 0:05:41 lr 0.000278 wd 0.0500 time 0.5706 (0.5831) data time 0.0008 (0.0122) model time 0.0000 (0.0000) loss 7.0312 (7.0637) grad_norm 2.8102 (2.5129) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:41:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][50/625] eta 0:05:33 lr 0.000278 wd 0.0500 time 0.5719 (0.5803) data time 0.0008 (0.0100) model time 0.0000 (0.0000) loss 6.7200 (7.0526) grad_norm 1.6044 (2.4336) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:42:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][60/625] eta 0:05:28 lr 0.000278 wd 0.0500 time 0.5710 (0.5821) data time 0.0009 (0.0085) model time 0.5701 (0.5900) loss 6.3437 (7.1323) grad_norm 2.4136 (2.5183) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:42:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][70/625] eta 0:05:22 lr 0.000278 wd 0.0500 time 0.5702 (0.5802) data time 0.0009 (0.0074) model time 0.5694 (0.5790) loss 7.1641 (7.1306) grad_norm 3.2921 (2.5943) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:42:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][80/625] eta 0:05:15 lr 0.000278 wd 0.0500 time 0.5691 (0.5789) data time 0.0009 (0.0066) model time 0.5682 (0.5756) loss 7.6196 (7.1004) grad_norm 3.2179 (2.6312) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:42:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][90/625] eta 0:05:09 lr 0.000278 wd 0.0500 time 0.5667 (0.5779) data time 0.0009 (0.0060) model time 0.5658 (0.5739) loss 9.0127 (7.1331) grad_norm 2.4706 (2.6509) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:42:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][100/625] eta 0:05:02 lr 0.000278 wd 0.0500 time 0.5694 (0.5770) data time 0.0008 (0.0055) model time 0.5686 (0.5728) loss 6.4819 (7.1865) grad_norm 3.1177 (2.6250) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:42:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][110/625] eta 0:04:56 lr 0.000278 wd 0.0500 time 0.5679 (0.5763) data time 0.0008 (0.0051) model time 0.5671 (0.5721) loss 8.3929 (7.1831) grad_norm 3.5510 (2.6455) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:42:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][120/625] eta 0:04:50 lr 0.000278 wd 0.0500 time 0.5716 (0.5758) data time 0.0006 (0.0047) model time 0.5710 (0.5716) loss 7.5279 (7.2083) grad_norm 1.8321 (2.5918) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:42:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][130/625] eta 0:04:44 lr 0.000278 wd 0.0500 time 0.5690 (0.5753) data time 0.0007 (0.0044) model time 0.5684 (0.5712) loss 8.1437 (7.2279) grad_norm 3.7480 (2.6033) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:42:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][140/625] eta 0:04:38 lr 0.000278 wd 0.0500 time 0.5698 (0.5749) data time 0.0007 (0.0041) model time 0.5692 (0.5711) loss 7.8907 (7.2241) grad_norm 4.8177 (2.6184) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:42:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][150/625] eta 0:04:32 lr 0.000277 wd 0.0500 time 0.5722 (0.5747) data time 0.0008 (0.0039) model time 0.5714 (0.5710) loss 7.0706 (7.2169) grad_norm 1.9218 (2.6431) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:43:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][160/625] eta 0:04:27 lr 0.000277 wd 0.0500 time 0.5687 (0.5744) data time 0.0006 (0.0037) model time 0.5682 (0.5708) loss 7.2262 (7.2176) grad_norm 2.4600 (2.6161) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:43:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][170/625] eta 0:04:21 lr 0.000277 wd 0.0500 time 0.5702 (0.5741) data time 0.0006 (0.0036) model time 0.5695 (0.5707) loss 6.5600 (7.1962) grad_norm 4.7457 (2.5977) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:43:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][180/625] eta 0:04:15 lr 0.000277 wd 0.0500 time 0.5718 (0.5738) data time 0.0006 (0.0034) model time 0.5712 (0.5705) loss 7.5867 (7.2159) grad_norm 2.0186 (2.6100) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:43:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][190/625] eta 0:04:09 lr 0.000277 wd 0.0500 time 0.5683 (0.5736) data time 0.0008 (0.0033) model time 0.5675 (0.5704) loss 7.2528 (7.2139) grad_norm 2.3536 (2.5960) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:43:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][200/625] eta 0:04:03 lr 0.000277 wd 0.0500 time 0.5696 (0.5734) data time 0.0006 (0.0031) model time 0.5690 (0.5702) loss 7.0913 (7.2172) grad_norm 1.7594 (2.5718) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:43:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][210/625] eta 0:03:57 lr 0.000277 wd 0.0500 time 0.5705 (0.5733) data time 0.0006 (0.0030) model time 0.5699 (0.5702) loss 7.2615 (7.2413) grad_norm 2.6714 (2.5703) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:43:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][220/625] eta 0:03:52 lr 0.000277 wd 0.0500 time 0.5802 (0.5732) data time 0.0006 (0.0029) model time 0.5796 (0.5702) loss 7.6895 (7.2603) grad_norm 2.7110 (2.5627) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:43:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][230/625] eta 0:03:46 lr 0.000277 wd 0.0500 time 0.5719 (0.5736) data time 0.0008 (0.0028) model time 0.5711 (0.5709) loss 6.5220 (7.2496) grad_norm 2.1740 (2.5572) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:43:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][240/625] eta 0:03:40 lr 0.000277 wd 0.0500 time 0.5693 (0.5735) data time 0.0006 (0.0028) model time 0.5687 (0.5708) loss 6.9239 (7.2294) grad_norm 2.6450 (2.5856) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:43:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][250/625] eta 0:03:34 lr 0.000277 wd 0.0500 time 0.5673 (0.5733) data time 0.0009 (0.0027) model time 0.5664 (0.5706) loss 7.0919 (7.2085) grad_norm 2.2832 (2.5743) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:43:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][260/625] eta 0:03:29 lr 0.000276 wd 0.0500 time 0.5721 (0.5731) data time 0.0007 (0.0026) model time 0.5714 (0.5706) loss 8.3090 (7.2120) grad_norm 3.4012 (2.5941) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:44:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][270/625] eta 0:03:23 lr 0.000276 wd 0.0500 time 0.5722 (0.5730) data time 0.0008 (0.0025) model time 0.5714 (0.5705) loss 8.3614 (7.2231) grad_norm 2.6142 (2.5829) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:44:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][280/625] eta 0:03:17 lr 0.000276 wd 0.0500 time 0.5747 (0.5736) data time 0.0007 (0.0025) model time 0.5739 (0.5712) loss 7.2406 (7.2238) grad_norm 1.9643 (2.5632) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:44:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][290/625] eta 0:03:12 lr 0.000276 wd 0.0500 time 0.5706 (0.5735) data time 0.0006 (0.0024) model time 0.5699 (0.5712) loss 7.5128 (7.2426) grad_norm 2.2277 (2.5463) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:44:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][300/625] eta 0:03:06 lr 0.000276 wd 0.0500 time 0.5695 (0.5734) data time 0.0007 (0.0024) model time 0.5688 (0.5711) loss 6.4385 (7.2443) grad_norm 2.7684 (2.5513) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:44:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][310/625] eta 0:03:00 lr 0.000276 wd 0.0500 time 0.5692 (0.5732) data time 0.0008 (0.0023) model time 0.5684 (0.5710) loss 7.2424 (7.2405) grad_norm 2.1844 (2.5398) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:44:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][320/625] eta 0:02:54 lr 0.000276 wd 0.0500 time 0.5689 (0.5731) data time 0.0006 (0.0023) model time 0.5682 (0.5710) loss 5.7423 (7.2386) grad_norm 2.2502 (2.5237) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:44:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][330/625] eta 0:02:49 lr 0.000276 wd 0.0500 time 0.5718 (0.5731) data time 0.0006 (0.0022) model time 0.5712 (0.5709) loss 8.1097 (7.2395) grad_norm 2.0210 (2.5162) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:44:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][340/625] eta 0:02:43 lr 0.000276 wd 0.0500 time 0.5672 (0.5729) data time 0.0007 (0.0022) model time 0.5666 (0.5708) loss 7.6599 (7.2431) grad_norm 2.1810 (2.5028) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:44:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][350/625] eta 0:02:37 lr 0.000276 wd 0.0500 time 0.5683 (0.5728) data time 0.0009 (0.0021) model time 0.5674 (0.5707) loss 6.3348 (7.2509) grad_norm 2.7500 (2.5056) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:44:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][360/625] eta 0:02:31 lr 0.000276 wd 0.0500 time 0.5697 (0.5728) data time 0.0006 (0.0021) model time 0.5691 (0.5707) loss 7.2157 (7.2604) grad_norm 2.1737 (2.4945) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:45:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][370/625] eta 0:02:26 lr 0.000275 wd 0.0500 time 0.5683 (0.5727) data time 0.0009 (0.0021) model time 0.5674 (0.5707) loss 7.5367 (7.2626) grad_norm 1.8094 (2.4879) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:45:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][380/625] eta 0:02:20 lr 0.000275 wd 0.0500 time 0.5877 (0.5727) data time 0.0006 (0.0020) model time 0.5871 (0.5707) loss 7.1254 (7.2535) grad_norm 2.1672 (2.4753) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:45:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][390/625] eta 0:02:14 lr 0.000275 wd 0.0500 time 0.5682 (0.5726) data time 0.0008 (0.0020) model time 0.5674 (0.5706) loss 5.8532 (7.2528) grad_norm 1.8982 (2.4670) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:45:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][400/625] eta 0:02:08 lr 0.000275 wd 0.0500 time 0.5747 (0.5725) data time 0.0008 (0.0020) model time 0.5738 (0.5706) loss 7.3277 (7.2585) grad_norm 2.6675 (2.4652) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:45:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][410/625] eta 0:02:03 lr 0.000275 wd 0.0500 time 0.5685 (0.5724) data time 0.0009 (0.0020) model time 0.5676 (0.5705) loss 8.5934 (7.2572) grad_norm 2.1444 (2.4815) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:45:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][420/625] eta 0:01:57 lr 0.000275 wd 0.0500 time 0.5721 (0.5723) data time 0.0008 (0.0019) model time 0.5713 (0.5704) loss 6.0622 (7.2495) grad_norm 2.2454 (2.4756) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:45:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][430/625] eta 0:01:51 lr 0.000275 wd 0.0500 time 0.5706 (0.5723) data time 0.0008 (0.0019) model time 0.5699 (0.5704) loss 6.8120 (7.2479) grad_norm 1.6203 (2.4697) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:45:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][440/625] eta 0:01:45 lr 0.000275 wd 0.0500 time 0.5727 (0.5723) data time 0.0009 (0.0019) model time 0.5718 (0.5704) loss 7.2792 (7.2359) grad_norm 1.5258 (2.4594) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:45:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][450/625] eta 0:01:40 lr 0.000275 wd 0.0500 time 0.5701 (0.5724) data time 0.0008 (0.0019) model time 0.5693 (0.5705) loss 7.4366 (7.2381) grad_norm 1.6587 (2.4787) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:45:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][460/625] eta 0:01:34 lr 0.000275 wd 0.0500 time 0.5708 (0.5723) data time 0.0007 (0.0018) model time 0.5700 (0.5705) loss 7.7821 (7.2416) grad_norm 2.6290 (2.4835) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:45:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][470/625] eta 0:01:28 lr 0.000275 wd 0.0500 time 0.5903 (0.5723) data time 0.0008 (0.0018) model time 0.5895 (0.5705) loss 7.8777 (7.2341) grad_norm 3.4813 (2.4873) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:46:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][480/625] eta 0:01:22 lr 0.000275 wd 0.0500 time 0.5749 (0.5723) data time 0.0007 (0.0018) model time 0.5742 (0.5705) loss 5.9823 (7.2301) grad_norm 2.3371 (2.4876) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:46:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][490/625] eta 0:01:17 lr 0.000274 wd 0.0500 time 0.5733 (0.5722) data time 0.0006 (0.0018) model time 0.5726 (0.5705) loss 6.4208 (7.2302) grad_norm 2.6574 (2.4838) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:46:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][500/625] eta 0:01:11 lr 0.000274 wd 0.0500 time 0.5704 (0.5726) data time 0.0006 (0.0018) model time 0.5698 (0.5709) loss 7.8502 (7.2313) grad_norm 1.8435 (2.4770) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:46:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][510/625] eta 0:01:05 lr 0.000274 wd 0.0500 time 0.5677 (0.5726) data time 0.0008 (0.0017) model time 0.5669 (0.5709) loss 7.6963 (7.2361) grad_norm 2.1850 (2.4687) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:46:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][520/625] eta 0:01:00 lr 0.000274 wd 0.0500 time 0.5718 (0.5725) data time 0.0008 (0.0017) model time 0.5710 (0.5709) loss 6.4859 (7.2376) grad_norm 1.7944 (2.4631) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:46:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][530/625] eta 0:00:54 lr 0.000274 wd 0.0500 time 0.5708 (0.5725) data time 0.0006 (0.0017) model time 0.5702 (0.5708) loss 8.5995 (7.2387) grad_norm 1.5241 (2.4623) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:46:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][540/625] eta 0:00:48 lr 0.000274 wd 0.0500 time 0.5706 (0.5724) data time 0.0006 (0.0017) model time 0.5700 (0.5708) loss 7.9179 (7.2393) grad_norm 2.0761 (2.4543) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:46:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][550/625] eta 0:00:42 lr 0.000274 wd 0.0500 time 0.5708 (0.5724) data time 0.0006 (0.0017) model time 0.5701 (0.5708) loss 7.7668 (7.2444) grad_norm 3.3650 (2.4631) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:46:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][560/625] eta 0:00:37 lr 0.000274 wd 0.0500 time 0.5750 (0.5724) data time 0.0006 (0.0017) model time 0.5744 (0.5707) loss 6.9046 (7.2507) grad_norm 7.9071 (2.4760) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:46:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][570/625] eta 0:00:31 lr 0.000274 wd 0.0500 time 0.5727 (0.5723) data time 0.0008 (0.0016) model time 0.5719 (0.5707) loss 9.2304 (7.2510) grad_norm 1.8501 (2.4734) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:47:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][580/625] eta 0:00:25 lr 0.000274 wd 0.0500 time 0.5737 (0.5723) data time 0.0006 (0.0016) model time 0.5731 (0.5707) loss 6.0820 (7.2492) grad_norm 1.5919 (2.4673) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:47:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][590/625] eta 0:00:20 lr 0.000274 wd 0.0500 time 0.5692 (0.5723) data time 0.0008 (0.0016) model time 0.5684 (0.5707) loss 8.2410 (7.2431) grad_norm 2.3317 (2.4690) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:47:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][600/625] eta 0:00:14 lr 0.000273 wd 0.0500 time 0.5710 (0.5723) data time 0.0007 (0.0016) model time 0.5703 (0.5707) loss 7.0911 (7.2473) grad_norm 5.9534 (2.4805) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:47:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][610/625] eta 0:00:08 lr 0.000273 wd 0.0500 time 0.5717 (0.5722) data time 0.0004 (0.0016) model time 0.5713 (0.5707) loss 6.6794 (7.2451) grad_norm 1.6671 (2.4810) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:47:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [212/300][620/625] eta 0:00:02 lr 0.000273 wd 0.0500 time 0.5706 (0.5722) data time 0.0006 (0.0016) model time 0.5700 (0.5706) loss 5.8417 (7.2422) grad_norm 2.3400 (2.4881) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:47:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 212 training takes 0:05:57 +[2024-07-25 12:47:26 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 12:47:27 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 12:47:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.464 (0.464) Loss 0.5068 (0.5068) Acc@1 90.430 (90.430) Acc@5 98.730 (98.730) Mem 22339MB +[2024-07-25 12:47:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.156) Loss 0.7578 (0.6211) Acc@1 82.812 (87.433) Acc@5 96.484 (97.874) Mem 22339MB +[2024-07-25 12:47:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.8438 (0.7125) Acc@1 80.078 (84.677) Acc@5 96.387 (97.091) Mem 22339MB +[2024-07-25 12:47:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.295 Acc@5 97.085 +[2024-07-25 12:47:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.3% +[2024-07-25 12:47:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 84.30% +[2024-07-25 12:47:30 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 12:47:32 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 12:47:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.461 (0.461) Loss 0.5044 (0.5044) Acc@1 90.430 (90.430) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-25 12:47:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.156) Loss 0.7490 (0.6205) Acc@1 83.252 (87.571) Acc@5 96.777 (98.025) Mem 22339MB +[2024-07-25 12:47:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.141) Loss 0.8652 (0.7106) Acc@1 78.857 (84.663) Acc@5 95.947 (97.147) Mem 22339MB +[2024-07-25 12:47:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.281 Acc@5 97.127 +[2024-07-25 12:47:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.3% +[2024-07-25 12:47:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.28% +[2024-07-25 12:47:35 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 12:47:37 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 12:47:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][0/625] eta 0:09:35 lr 0.000273 wd 0.0500 time 0.9205 (0.9205) data time 0.4004 (0.4004) model time 0.0000 (0.0000) loss 8.3215 (8.3215) grad_norm 2.9256 (2.9256) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:47:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][10/625] eta 0:06:10 lr 0.000273 wd 0.0500 time 0.5713 (0.6021) data time 0.0008 (0.0371) model time 0.0000 (0.0000) loss 6.5299 (7.4605) grad_norm 3.3283 (3.7452) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:47:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][20/625] eta 0:05:55 lr 0.000273 wd 0.0500 time 0.5690 (0.5870) data time 0.0006 (0.0198) model time 0.0000 (0.0000) loss 6.6669 (7.2176) grad_norm 1.9097 (3.0773) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:47:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][30/625] eta 0:05:46 lr 0.000273 wd 0.0500 time 0.5663 (0.5817) data time 0.0006 (0.0137) model time 0.0000 (0.0000) loss 5.7823 (7.2127) grad_norm 1.6182 (2.8045) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:48:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][40/625] eta 0:05:38 lr 0.000273 wd 0.0500 time 0.5676 (0.5789) data time 0.0007 (0.0106) model time 0.0000 (0.0000) loss 6.7538 (7.2523) grad_norm 1.9504 (2.7465) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:48:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][50/625] eta 0:05:32 lr 0.000273 wd 0.0500 time 0.5683 (0.5775) data time 0.0007 (0.0087) model time 0.0000 (0.0000) loss 6.9067 (7.2626) grad_norm 3.0444 (2.6655) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:48:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][60/625] eta 0:05:25 lr 0.000273 wd 0.0500 time 0.5641 (0.5763) data time 0.0006 (0.0074) model time 0.5635 (0.5694) loss 5.8079 (7.2765) grad_norm 4.1070 (2.6920) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:48:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][70/625] eta 0:05:19 lr 0.000273 wd 0.0500 time 0.5645 (0.5754) data time 0.0008 (0.0065) model time 0.5636 (0.5691) loss 7.6980 (7.2735) grad_norm 2.2214 (2.7076) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:48:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][80/625] eta 0:05:13 lr 0.000273 wd 0.0500 time 0.5698 (0.5748) data time 0.0008 (0.0058) model time 0.5690 (0.5693) loss 7.4016 (7.3024) grad_norm 1.9384 (2.7058) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:48:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][90/625] eta 0:05:08 lr 0.000272 wd 0.0500 time 0.5694 (0.5763) data time 0.0008 (0.0052) model time 0.5686 (0.5738) loss 8.9583 (7.3105) grad_norm 1.9463 (2.7155) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:48:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][100/625] eta 0:05:02 lr 0.000272 wd 0.0500 time 0.5691 (0.5757) data time 0.0006 (0.0048) model time 0.5684 (0.5731) loss 6.2855 (7.2672) grad_norm 1.9275 (2.6582) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:48:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][110/625] eta 0:04:56 lr 0.000272 wd 0.0500 time 0.5635 (0.5755) data time 0.0009 (0.0044) model time 0.5626 (0.5729) loss 6.9060 (7.2357) grad_norm 2.4333 (2.7232) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:48:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][120/625] eta 0:04:50 lr 0.000272 wd 0.0500 time 0.5703 (0.5752) data time 0.0008 (0.0041) model time 0.5696 (0.5726) loss 8.1336 (7.2623) grad_norm 2.0309 (2.7171) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:48:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][130/625] eta 0:04:44 lr 0.000272 wd 0.0500 time 0.5656 (0.5749) data time 0.0008 (0.0039) model time 0.5648 (0.5724) loss 5.7338 (7.2439) grad_norm 4.6433 (2.7727) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:48:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][140/625] eta 0:04:38 lr 0.000272 wd 0.0500 time 0.5635 (0.5747) data time 0.0007 (0.0037) model time 0.5628 (0.5722) loss 9.3042 (7.2575) grad_norm 3.5056 (2.7906) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:49:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][150/625] eta 0:04:32 lr 0.000272 wd 0.0500 time 0.5659 (0.5746) data time 0.0006 (0.0035) model time 0.5653 (0.5723) loss 5.7144 (7.2612) grad_norm 1.8691 (2.7673) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:49:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][160/625] eta 0:04:27 lr 0.000272 wd 0.0500 time 0.5684 (0.5744) data time 0.0009 (0.0033) model time 0.5675 (0.5721) loss 7.5552 (7.2731) grad_norm 13.3554 (2.8049) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:49:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][170/625] eta 0:04:21 lr 0.000272 wd 0.0500 time 0.5714 (0.5742) data time 0.0008 (0.0032) model time 0.5706 (0.5720) loss 6.6129 (7.2913) grad_norm 2.2309 (2.7740) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:49:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][180/625] eta 0:04:15 lr 0.000272 wd 0.0500 time 0.5691 (0.5746) data time 0.0008 (0.0030) model time 0.5683 (0.5726) loss 7.4873 (7.2693) grad_norm 2.2768 (2.7503) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:49:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][190/625] eta 0:04:10 lr 0.000272 wd 0.0500 time 0.5677 (0.5748) data time 0.0008 (0.0030) model time 0.5669 (0.5729) loss 9.0653 (7.2812) grad_norm 2.1993 (2.7368) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:49:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][200/625] eta 0:04:04 lr 0.000271 wd 0.0500 time 0.5695 (0.5751) data time 0.0006 (0.0029) model time 0.5688 (0.5735) loss 6.6905 (7.2624) grad_norm 2.3279 (2.7065) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:49:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][210/625] eta 0:03:58 lr 0.000271 wd 0.0500 time 0.5704 (0.5751) data time 0.0009 (0.0028) model time 0.5695 (0.5735) loss 6.9532 (7.2772) grad_norm 1.8000 (2.6835) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:49:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][220/625] eta 0:03:53 lr 0.000271 wd 0.0500 time 0.5647 (0.5755) data time 0.0009 (0.0027) model time 0.5638 (0.5740) loss 7.5142 (7.2765) grad_norm 2.0278 (2.6771) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:49:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][230/625] eta 0:03:47 lr 0.000271 wd 0.0500 time 0.5681 (0.5756) data time 0.0008 (0.0027) model time 0.5673 (0.5740) loss 7.3550 (7.2699) grad_norm 2.9215 (2.6621) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:49:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][240/625] eta 0:03:41 lr 0.000271 wd 0.0500 time 0.5685 (0.5758) data time 0.0007 (0.0026) model time 0.5679 (0.5744) loss 7.8132 (7.2787) grad_norm 2.4380 (2.6991) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:50:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][250/625] eta 0:03:35 lr 0.000271 wd 0.0500 time 0.5676 (0.5758) data time 0.0007 (0.0025) model time 0.5669 (0.5744) loss 7.7835 (7.2738) grad_norm 1.8885 (2.6894) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:50:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][260/625] eta 0:03:30 lr 0.000271 wd 0.0500 time 0.5624 (0.5756) data time 0.0007 (0.0025) model time 0.5617 (0.5742) loss 7.5647 (7.2812) grad_norm 2.6586 (2.6878) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:50:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][270/625] eta 0:03:24 lr 0.000271 wd 0.0500 time 0.5657 (0.5759) data time 0.0008 (0.0025) model time 0.5649 (0.5745) loss 7.3760 (7.2935) grad_norm 2.2932 (2.6996) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:50:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][280/625] eta 0:03:18 lr 0.000271 wd 0.0500 time 0.5628 (0.5758) data time 0.0008 (0.0024) model time 0.5619 (0.5745) loss 7.5057 (7.2959) grad_norm 1.7982 (2.6995) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:50:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][290/625] eta 0:03:12 lr 0.000271 wd 0.0500 time 0.5682 (0.5757) data time 0.0008 (0.0024) model time 0.5674 (0.5743) loss 7.8167 (7.2958) grad_norm 2.1436 (2.7106) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:50:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][300/625] eta 0:03:07 lr 0.000271 wd 0.0500 time 0.5632 (0.5755) data time 0.0006 (0.0023) model time 0.5626 (0.5741) loss 8.0136 (7.3069) grad_norm 2.6404 (2.6950) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:50:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][310/625] eta 0:03:01 lr 0.000270 wd 0.0500 time 0.5734 (0.5760) data time 0.0006 (0.0023) model time 0.5727 (0.5747) loss 6.9458 (7.3043) grad_norm 1.8005 (2.6969) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:50:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][320/625] eta 0:02:55 lr 0.000270 wd 0.0500 time 0.5664 (0.5765) data time 0.0008 (0.0022) model time 0.5656 (0.5753) loss 7.3016 (7.2891) grad_norm 1.8389 (2.6917) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:50:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][330/625] eta 0:02:50 lr 0.000270 wd 0.0500 time 0.5713 (0.5764) data time 0.0007 (0.0022) model time 0.5707 (0.5752) loss 7.4854 (7.3023) grad_norm 1.8584 (2.6696) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:50:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][340/625] eta 0:02:44 lr 0.000270 wd 0.0500 time 0.5671 (0.5762) data time 0.0008 (0.0021) model time 0.5663 (0.5749) loss 8.1819 (7.3222) grad_norm 2.3680 (2.6553) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:50:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][350/625] eta 0:02:38 lr 0.000270 wd 0.0500 time 0.5658 (0.5760) data time 0.0006 (0.0021) model time 0.5652 (0.5747) loss 7.0029 (7.3128) grad_norm 2.5221 (2.6462) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:51:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][360/625] eta 0:02:32 lr 0.000270 wd 0.0500 time 0.5647 (0.5759) data time 0.0006 (0.0021) model time 0.5641 (0.5746) loss 7.4336 (7.3123) grad_norm 3.7090 (2.6427) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:51:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][370/625] eta 0:02:26 lr 0.000270 wd 0.0500 time 0.5709 (0.5757) data time 0.0006 (0.0020) model time 0.5703 (0.5745) loss 7.8882 (7.3041) grad_norm 3.4070 (2.6429) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:51:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][380/625] eta 0:02:21 lr 0.000270 wd 0.0500 time 0.5706 (0.5756) data time 0.0008 (0.0020) model time 0.5698 (0.5744) loss 7.0744 (7.3070) grad_norm 2.1381 (2.6316) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:51:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][390/625] eta 0:02:15 lr 0.000270 wd 0.0500 time 0.5190 (0.5759) data time 0.0008 (0.0020) model time 0.5182 (0.5747) loss 7.2158 (7.3068) grad_norm 2.8753 (2.6343) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:51:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][400/625] eta 0:02:09 lr 0.000270 wd 0.0500 time 0.5653 (0.5757) data time 0.0007 (0.0019) model time 0.5645 (0.5745) loss 7.4214 (7.3116) grad_norm 1.9692 (2.6290) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:51:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][410/625] eta 0:02:03 lr 0.000270 wd 0.0500 time 0.5679 (0.5756) data time 0.0009 (0.0019) model time 0.5670 (0.5744) loss 6.7293 (7.3014) grad_norm 1.7308 (2.6338) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:51:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][420/625] eta 0:01:57 lr 0.000270 wd 0.0500 time 0.5716 (0.5755) data time 0.0006 (0.0019) model time 0.5709 (0.5742) loss 7.1882 (7.3136) grad_norm 2.7406 (2.6588) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:51:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][430/625] eta 0:01:52 lr 0.000269 wd 0.0500 time 0.5666 (0.5753) data time 0.0006 (0.0019) model time 0.5660 (0.5741) loss 7.0480 (7.3164) grad_norm 1.9453 (2.6532) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:51:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][440/625] eta 0:01:46 lr 0.000269 wd 0.0500 time 0.5679 (0.5752) data time 0.0010 (0.0018) model time 0.5669 (0.5740) loss 8.2409 (7.3234) grad_norm 2.2566 (2.6519) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:51:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][450/625] eta 0:01:40 lr 0.000269 wd 0.0500 time 0.5658 (0.5751) data time 0.0007 (0.0018) model time 0.5652 (0.5739) loss 8.1873 (7.3231) grad_norm 3.8175 (2.6493) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:52:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][460/625] eta 0:01:34 lr 0.000269 wd 0.0500 time 0.5715 (0.5750) data time 0.0007 (0.0018) model time 0.5708 (0.5738) loss 6.4157 (7.3170) grad_norm 2.1850 (2.6537) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:52:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][470/625] eta 0:01:29 lr 0.000269 wd 0.0500 time 0.5697 (0.5749) data time 0.0007 (0.0018) model time 0.5691 (0.5737) loss 6.2433 (7.3082) grad_norm 2.5002 (2.6581) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:52:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][480/625] eta 0:01:23 lr 0.000269 wd 0.0500 time 0.5674 (0.5749) data time 0.0008 (0.0017) model time 0.5666 (0.5736) loss 6.5223 (7.3031) grad_norm 2.1623 (2.6520) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:52:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][490/625] eta 0:01:17 lr 0.000269 wd 0.0500 time 0.5690 (0.5748) data time 0.0008 (0.0017) model time 0.5682 (0.5735) loss 7.8563 (7.3070) grad_norm 1.9543 (2.6481) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:52:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][500/625] eta 0:01:11 lr 0.000269 wd 0.0500 time 0.5666 (0.5747) data time 0.0009 (0.0017) model time 0.5658 (0.5734) loss 6.8191 (7.3096) grad_norm 2.9407 (2.6652) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:52:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][510/625] eta 0:01:06 lr 0.000269 wd 0.0500 time 0.5711 (0.5746) data time 0.0006 (0.0017) model time 0.5705 (0.5733) loss 8.6201 (7.3117) grad_norm 2.4701 (2.6773) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:52:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][520/625] eta 0:01:00 lr 0.000269 wd 0.0500 time 0.5662 (0.5745) data time 0.0008 (0.0017) model time 0.5654 (0.5733) loss 7.6845 (7.3232) grad_norm 2.7104 (2.6731) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:52:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][530/625] eta 0:00:54 lr 0.000269 wd 0.0500 time 0.6762 (0.5750) data time 0.0008 (0.0017) model time 0.6754 (0.5738) loss 6.5418 (7.3239) grad_norm 2.6776 (2.6644) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:52:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][540/625] eta 0:00:48 lr 0.000268 wd 0.0500 time 0.5682 (0.5749) data time 0.0009 (0.0016) model time 0.5674 (0.5738) loss 7.2880 (7.3252) grad_norm 2.1627 (2.6571) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:52:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][550/625] eta 0:00:43 lr 0.000268 wd 0.0500 time 0.5699 (0.5749) data time 0.0008 (0.0016) model time 0.5691 (0.5737) loss 7.5512 (7.3214) grad_norm 3.8373 (2.6642) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:52:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][560/625] eta 0:00:37 lr 0.000268 wd 0.0500 time 0.5673 (0.5748) data time 0.0006 (0.0016) model time 0.5666 (0.5736) loss 7.5572 (7.3270) grad_norm 2.6230 (2.6563) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:53:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][570/625] eta 0:00:31 lr 0.000268 wd 0.0500 time 0.5667 (0.5747) data time 0.0008 (0.0016) model time 0.5659 (0.5735) loss 6.5992 (7.3274) grad_norm 2.2270 (2.6635) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:53:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][580/625] eta 0:00:25 lr 0.000268 wd 0.0500 time 0.5635 (0.5746) data time 0.0007 (0.0016) model time 0.5628 (0.5734) loss 6.8087 (7.3276) grad_norm 2.2179 (2.6686) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:53:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][590/625] eta 0:00:20 lr 0.000268 wd 0.0500 time 0.5681 (0.5745) data time 0.0007 (0.0016) model time 0.5675 (0.5733) loss 6.8926 (7.3265) grad_norm 3.0666 (2.6659) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:53:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][600/625] eta 0:00:14 lr 0.000268 wd 0.0500 time 0.5713 (0.5745) data time 0.0006 (0.0016) model time 0.5707 (0.5733) loss 7.9984 (7.3286) grad_norm 2.5603 (2.6593) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:53:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][610/625] eta 0:00:08 lr 0.000268 wd 0.0500 time 0.5661 (0.5744) data time 0.0006 (0.0016) model time 0.5656 (0.5732) loss 7.6816 (7.3295) grad_norm 2.2114 (2.6551) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:53:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [213/300][620/625] eta 0:00:02 lr 0.000268 wd 0.0500 time 0.5691 (0.5745) data time 0.0005 (0.0015) model time 0.5686 (0.5733) loss 7.3115 (7.3301) grad_norm 2.2582 (2.6502) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:53:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 213 training takes 0:05:59 +[2024-07-25 12:53:36 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 12:53:39 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 12:53:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.458 (0.458) Loss 0.5103 (0.5103) Acc@1 89.844 (89.844) Acc@5 98.828 (98.828) Mem 22339MB +[2024-07-25 12:53:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.156) Loss 0.7700 (0.6247) Acc@1 82.812 (87.291) Acc@5 96.631 (97.936) Mem 22339MB +[2024-07-25 12:53:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.8672 (0.7217) Acc@1 79.297 (84.584) Acc@5 95.996 (97.024) Mem 22339MB +[2024-07-25 12:53:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.261 Acc@5 97.001 +[2024-07-25 12:53:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.3% +[2024-07-25 12:53:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.948 (0.948) Loss 0.5049 (0.5049) Acc@1 90.430 (90.430) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 12:53:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.200) Loss 0.7490 (0.6205) Acc@1 83.203 (87.593) Acc@5 96.826 (98.029) Mem 22339MB +[2024-07-25 12:53:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.165) Loss 0.8643 (0.7104) Acc@1 78.809 (84.689) Acc@5 95.996 (97.166) Mem 22339MB +[2024-07-25 12:53:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.305 Acc@5 97.147 +[2024-07-25 12:53:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.3% +[2024-07-25 12:53:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.31% +[2024-07-25 12:53:46 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 12:53:48 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 12:53:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][0/625] eta 0:09:30 lr 0.000268 wd 0.0500 time 0.9128 (0.9128) data time 0.3957 (0.3957) model time 0.0000 (0.0000) loss 8.4335 (8.4335) grad_norm 2.1565 (2.1565) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:53:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][10/625] eta 0:06:09 lr 0.000268 wd 0.0500 time 0.5786 (0.6002) data time 0.0006 (0.0367) model time 0.0000 (0.0000) loss 6.7003 (6.8758) grad_norm 1.8545 (2.0059) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:54:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][20/625] eta 0:05:53 lr 0.000268 wd 0.0500 time 0.5703 (0.5851) data time 0.0006 (0.0196) model time 0.0000 (0.0000) loss 7.8762 (6.7700) grad_norm 2.1376 (2.0137) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:54:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][30/625] eta 0:05:45 lr 0.000267 wd 0.0500 time 0.5668 (0.5804) data time 0.0007 (0.0135) model time 0.0000 (0.0000) loss 6.8664 (6.9025) grad_norm 1.9962 (2.0765) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:54:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][40/625] eta 0:05:37 lr 0.000267 wd 0.0500 time 0.5665 (0.5777) data time 0.0006 (0.0104) model time 0.0000 (0.0000) loss 7.8921 (7.1054) grad_norm 2.7884 (2.0676) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:54:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][50/625] eta 0:05:31 lr 0.000267 wd 0.0500 time 0.5681 (0.5760) data time 0.0006 (0.0085) model time 0.0000 (0.0000) loss 7.7562 (7.1352) grad_norm 3.2017 (2.2343) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:54:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][60/625] eta 0:05:24 lr 0.000267 wd 0.0500 time 0.5651 (0.5748) data time 0.0009 (0.0073) model time 0.5643 (0.5683) loss 7.7159 (7.1266) grad_norm 2.5795 (2.3098) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:54:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][70/625] eta 0:05:18 lr 0.000267 wd 0.0500 time 0.5664 (0.5741) data time 0.0008 (0.0064) model time 0.5656 (0.5685) loss 7.1804 (7.1476) grad_norm 2.0849 (2.2853) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:54:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][80/625] eta 0:05:12 lr 0.000267 wd 0.0500 time 0.5658 (0.5737) data time 0.0006 (0.0057) model time 0.5652 (0.5690) loss 6.3192 (7.1279) grad_norm 2.2159 (2.3068) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:54:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][90/625] eta 0:05:06 lr 0.000267 wd 0.0500 time 0.5700 (0.5733) data time 0.0007 (0.0051) model time 0.5693 (0.5691) loss 5.2318 (7.1453) grad_norm 1.8990 (2.3067) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:54:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][100/625] eta 0:05:00 lr 0.000267 wd 0.0500 time 0.5671 (0.5729) data time 0.0006 (0.0047) model time 0.5665 (0.5690) loss 7.0866 (7.1320) grad_norm 3.4823 (2.3462) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:54:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][110/625] eta 0:04:54 lr 0.000267 wd 0.0500 time 0.5635 (0.5728) data time 0.0006 (0.0043) model time 0.5629 (0.5694) loss 7.4408 (7.1281) grad_norm 2.5530 (2.3539) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:54:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][120/625] eta 0:04:50 lr 0.000267 wd 0.0500 time 0.7731 (0.5744) data time 0.0007 (0.0040) model time 0.7724 (0.5725) loss 8.1315 (7.1651) grad_norm 2.9639 (2.3720) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:55:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][130/625] eta 0:04:44 lr 0.000267 wd 0.0500 time 0.5678 (0.5750) data time 0.0008 (0.0038) model time 0.5670 (0.5736) loss 7.9605 (7.1883) grad_norm 1.8091 (2.3819) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:55:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][140/625] eta 0:04:38 lr 0.000267 wd 0.0500 time 0.5677 (0.5747) data time 0.0006 (0.0036) model time 0.5670 (0.5732) loss 7.3445 (7.1996) grad_norm 2.0954 (2.4477) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:55:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][150/625] eta 0:04:33 lr 0.000266 wd 0.0500 time 0.5640 (0.5753) data time 0.0008 (0.0034) model time 0.5632 (0.5742) loss 7.4826 (7.1880) grad_norm 2.0583 (2.4489) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:55:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][160/625] eta 0:04:27 lr 0.000266 wd 0.0500 time 0.5673 (0.5749) data time 0.0008 (0.0032) model time 0.5665 (0.5737) loss 7.9529 (7.1926) grad_norm 1.6362 (2.4356) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:55:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][170/625] eta 0:04:21 lr 0.000266 wd 0.0500 time 0.5708 (0.5746) data time 0.0008 (0.0031) model time 0.5700 (0.5733) loss 7.2825 (7.2058) grad_norm 2.2099 (2.4222) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:55:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][180/625] eta 0:04:15 lr 0.000266 wd 0.0500 time 0.5686 (0.5744) data time 0.0006 (0.0030) model time 0.5681 (0.5730) loss 6.5021 (7.1713) grad_norm 2.2505 (2.4234) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:55:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][190/625] eta 0:04:09 lr 0.000266 wd 0.0500 time 0.5671 (0.5742) data time 0.0008 (0.0029) model time 0.5663 (0.5727) loss 6.4392 (7.1501) grad_norm 2.1792 (2.4228) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:55:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][200/625] eta 0:04:03 lr 0.000266 wd 0.0500 time 0.5697 (0.5740) data time 0.0006 (0.0027) model time 0.5691 (0.5726) loss 6.1940 (7.1420) grad_norm 3.6640 (2.4304) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:55:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][210/625] eta 0:03:58 lr 0.000266 wd 0.0500 time 0.5695 (0.5738) data time 0.0008 (0.0027) model time 0.5687 (0.5724) loss 6.7134 (7.1605) grad_norm 1.7379 (2.4233) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:55:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][220/625] eta 0:03:52 lr 0.000266 wd 0.0500 time 0.5656 (0.5736) data time 0.0006 (0.0026) model time 0.5649 (0.5721) loss 7.3194 (7.1657) grad_norm 1.6525 (2.4322) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:56:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][230/625] eta 0:03:46 lr 0.000266 wd 0.0500 time 0.5689 (0.5734) data time 0.0008 (0.0025) model time 0.5681 (0.5719) loss 8.2554 (7.1803) grad_norm 2.8823 (2.4716) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:56:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][240/625] eta 0:03:40 lr 0.000266 wd 0.0500 time 0.5734 (0.5732) data time 0.0008 (0.0024) model time 0.5725 (0.5717) loss 7.7988 (7.1745) grad_norm 2.3961 (2.4730) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:56:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][250/625] eta 0:03:34 lr 0.000266 wd 0.0500 time 0.5689 (0.5731) data time 0.0008 (0.0024) model time 0.5681 (0.5716) loss 7.3486 (7.1665) grad_norm 1.8781 (2.4566) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:56:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][260/625] eta 0:03:29 lr 0.000265 wd 0.0500 time 0.5693 (0.5731) data time 0.0006 (0.0023) model time 0.5687 (0.5716) loss 7.5116 (7.1838) grad_norm 2.0717 (2.4406) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:56:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][270/625] eta 0:03:23 lr 0.000265 wd 0.0500 time 0.5695 (0.5730) data time 0.0007 (0.0022) model time 0.5688 (0.5715) loss 7.7435 (7.1825) grad_norm 1.9985 (2.4314) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:56:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][280/625] eta 0:03:17 lr 0.000265 wd 0.0500 time 0.5661 (0.5730) data time 0.0008 (0.0022) model time 0.5653 (0.5715) loss 7.4983 (7.1902) grad_norm 3.0521 (2.4394) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:56:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][290/625] eta 0:03:11 lr 0.000265 wd 0.0500 time 0.5695 (0.5729) data time 0.0009 (0.0021) model time 0.5686 (0.5715) loss 7.6740 (7.1918) grad_norm 1.9696 (2.4404) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:56:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][300/625] eta 0:03:06 lr 0.000265 wd 0.0500 time 0.5684 (0.5728) data time 0.0007 (0.0021) model time 0.5676 (0.5714) loss 5.4510 (7.1941) grad_norm 2.1363 (2.4409) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:56:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][310/625] eta 0:03:00 lr 0.000265 wd 0.0500 time 0.5669 (0.5727) data time 0.0008 (0.0021) model time 0.5661 (0.5713) loss 7.0908 (7.2071) grad_norm 2.1220 (2.4396) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:56:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][320/625] eta 0:02:54 lr 0.000265 wd 0.0500 time 0.5670 (0.5726) data time 0.0007 (0.0020) model time 0.5663 (0.5712) loss 8.3021 (7.2046) grad_norm 2.5803 (2.4432) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:56:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][330/625] eta 0:02:48 lr 0.000265 wd 0.0500 time 0.5672 (0.5725) data time 0.0009 (0.0020) model time 0.5662 (0.5711) loss 6.6097 (7.2131) grad_norm 2.4724 (2.4308) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:57:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][340/625] eta 0:02:43 lr 0.000265 wd 0.0500 time 0.5687 (0.5724) data time 0.0006 (0.0019) model time 0.5682 (0.5710) loss 8.5550 (7.2137) grad_norm 1.4991 (2.4248) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:57:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][350/625] eta 0:02:37 lr 0.000265 wd 0.0500 time 0.5714 (0.5734) data time 0.0007 (0.0019) model time 0.5706 (0.5721) loss 6.5850 (7.2103) grad_norm 3.5228 (2.4343) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:57:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][360/625] eta 0:02:31 lr 0.000265 wd 0.0500 time 0.5665 (0.5733) data time 0.0010 (0.0019) model time 0.5655 (0.5720) loss 8.4369 (7.2290) grad_norm 2.5608 (2.4451) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 12:57:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][370/625] eta 0:02:26 lr 0.000264 wd 0.0500 time 0.5702 (0.5734) data time 0.0007 (0.0019) model time 0.5695 (0.5721) loss 6.4439 (7.2248) grad_norm 1.7435 (2.4467) loss_scale 1024.0000 (513.3801) mem 22339MB +[2024-07-25 12:57:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][380/625] eta 0:02:20 lr 0.000264 wd 0.0500 time 0.5685 (0.5732) data time 0.0006 (0.0018) model time 0.5679 (0.5720) loss 6.7739 (7.2149) grad_norm 2.6865 (inf) loss_scale 512.0000 (518.7192) mem 22339MB +[2024-07-25 12:57:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][390/625] eta 0:02:14 lr 0.000264 wd 0.0500 time 0.5682 (0.5732) data time 0.0008 (0.0018) model time 0.5674 (0.5720) loss 7.3338 (7.2150) grad_norm 2.8843 (inf) loss_scale 512.0000 (518.5473) mem 22339MB +[2024-07-25 12:57:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][400/625] eta 0:02:08 lr 0.000264 wd 0.0500 time 0.5653 (0.5732) data time 0.0007 (0.0018) model time 0.5647 (0.5720) loss 6.3894 (7.2092) grad_norm 2.4244 (inf) loss_scale 512.0000 (518.3840) mem 22339MB +[2024-07-25 12:57:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][410/625] eta 0:02:03 lr 0.000264 wd 0.0500 time 0.5703 (0.5732) data time 0.0009 (0.0018) model time 0.5693 (0.5720) loss 7.6076 (7.2166) grad_norm 2.1700 (inf) loss_scale 512.0000 (518.2287) mem 22339MB +[2024-07-25 12:57:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][420/625] eta 0:01:57 lr 0.000264 wd 0.0500 time 0.5654 (0.5731) data time 0.0007 (0.0017) model time 0.5647 (0.5719) loss 6.8211 (7.2120) grad_norm 1.7687 (inf) loss_scale 512.0000 (518.0808) mem 22339MB +[2024-07-25 12:57:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][430/625] eta 0:01:51 lr 0.000264 wd 0.0500 time 0.5693 (0.5730) data time 0.0008 (0.0017) model time 0.5685 (0.5718) loss 7.5056 (7.2090) grad_norm 2.6323 (inf) loss_scale 512.0000 (517.9397) mem 22339MB +[2024-07-25 12:58:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][440/625] eta 0:01:45 lr 0.000264 wd 0.0500 time 0.5640 (0.5730) data time 0.0008 (0.0017) model time 0.5632 (0.5718) loss 7.3112 (7.2168) grad_norm 2.0251 (inf) loss_scale 512.0000 (517.8050) mem 22339MB +[2024-07-25 12:58:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][450/625] eta 0:01:40 lr 0.000264 wd 0.0500 time 0.5681 (0.5729) data time 0.0009 (0.0017) model time 0.5672 (0.5717) loss 7.6404 (7.2038) grad_norm 2.3508 (inf) loss_scale 512.0000 (517.6763) mem 22339MB +[2024-07-25 12:58:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][460/625] eta 0:01:34 lr 0.000264 wd 0.0500 time 0.5683 (0.5729) data time 0.0006 (0.0017) model time 0.5676 (0.5717) loss 5.4525 (7.1957) grad_norm 3.5651 (inf) loss_scale 512.0000 (517.5531) mem 22339MB +[2024-07-25 12:58:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][470/625] eta 0:01:28 lr 0.000264 wd 0.0500 time 0.5595 (0.5733) data time 0.0008 (0.0016) model time 0.5587 (0.5722) loss 7.5639 (7.1999) grad_norm 2.7477 (inf) loss_scale 512.0000 (517.4352) mem 22339MB +[2024-07-25 12:58:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][480/625] eta 0:01:23 lr 0.000264 wd 0.0500 time 0.5650 (0.5733) data time 0.0008 (0.0016) model time 0.5642 (0.5722) loss 8.6086 (7.2016) grad_norm 1.9778 (inf) loss_scale 512.0000 (517.3222) mem 22339MB +[2024-07-25 12:58:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][490/625] eta 0:01:17 lr 0.000263 wd 0.0500 time 0.5710 (0.5733) data time 0.0007 (0.0016) model time 0.5703 (0.5721) loss 8.1293 (7.2030) grad_norm 2.5491 (inf) loss_scale 512.0000 (517.2138) mem 22339MB +[2024-07-25 12:58:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][500/625] eta 0:01:11 lr 0.000263 wd 0.0500 time 0.5670 (0.5733) data time 0.0008 (0.0016) model time 0.5662 (0.5722) loss 7.4378 (7.1929) grad_norm 1.9139 (inf) loss_scale 512.0000 (517.1098) mem 22339MB +[2024-07-25 12:58:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][510/625] eta 0:01:05 lr 0.000263 wd 0.0500 time 0.5628 (0.5733) data time 0.0009 (0.0016) model time 0.5619 (0.5722) loss 6.3289 (7.1924) grad_norm 2.6776 (inf) loss_scale 512.0000 (517.0098) mem 22339MB +[2024-07-25 12:58:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][520/625] eta 0:01:00 lr 0.000263 wd 0.0500 time 0.5612 (0.5732) data time 0.0008 (0.0016) model time 0.5605 (0.5721) loss 6.7233 (7.1920) grad_norm 2.6283 (inf) loss_scale 512.0000 (516.9136) mem 22339MB +[2024-07-25 12:58:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][530/625] eta 0:00:54 lr 0.000263 wd 0.0500 time 0.5686 (0.5732) data time 0.0006 (0.0016) model time 0.5680 (0.5721) loss 6.7946 (7.1919) grad_norm 2.2066 (inf) loss_scale 512.0000 (516.8211) mem 22339MB +[2024-07-25 12:58:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][540/625] eta 0:00:48 lr 0.000263 wd 0.0500 time 0.5700 (0.5732) data time 0.0008 (0.0016) model time 0.5692 (0.5721) loss 6.4293 (7.1974) grad_norm 2.6971 (inf) loss_scale 512.0000 (516.7320) mem 22339MB +[2024-07-25 12:59:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][550/625] eta 0:00:42 lr 0.000263 wd 0.0500 time 0.5686 (0.5732) data time 0.0008 (0.0015) model time 0.5678 (0.5720) loss 7.9404 (7.1994) grad_norm 2.0726 (inf) loss_scale 512.0000 (516.6461) mem 22339MB +[2024-07-25 12:59:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][560/625] eta 0:00:37 lr 0.000263 wd 0.0500 time 0.5693 (0.5731) data time 0.0008 (0.0015) model time 0.5685 (0.5720) loss 8.1395 (7.2117) grad_norm 1.7607 (inf) loss_scale 512.0000 (516.5633) mem 22339MB +[2024-07-25 12:59:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][570/625] eta 0:00:31 lr 0.000263 wd 0.0500 time 0.5696 (0.5735) data time 0.0009 (0.0015) model time 0.5687 (0.5725) loss 6.9891 (7.2110) grad_norm 1.8519 (inf) loss_scale 512.0000 (516.4834) mem 22339MB +[2024-07-25 12:59:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][580/625] eta 0:00:25 lr 0.000263 wd 0.0500 time 0.5703 (0.5735) data time 0.0009 (0.0015) model time 0.5694 (0.5724) loss 8.0590 (7.2100) grad_norm 3.2919 (inf) loss_scale 512.0000 (516.4062) mem 22339MB +[2024-07-25 12:59:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][590/625] eta 0:00:20 lr 0.000263 wd 0.0500 time 0.5691 (0.5735) data time 0.0008 (0.0015) model time 0.5684 (0.5725) loss 5.2515 (7.2026) grad_norm 1.6888 (inf) loss_scale 512.0000 (516.3316) mem 22339MB +[2024-07-25 12:59:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][600/625] eta 0:00:14 lr 0.000262 wd 0.0500 time 0.5699 (0.5735) data time 0.0008 (0.0015) model time 0.5691 (0.5724) loss 7.9480 (7.2113) grad_norm 2.0910 (inf) loss_scale 512.0000 (516.2596) mem 22339MB +[2024-07-25 12:59:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][610/625] eta 0:00:08 lr 0.000262 wd 0.0500 time 0.5698 (0.5734) data time 0.0004 (0.0015) model time 0.5694 (0.5724) loss 6.5177 (7.2086) grad_norm 2.5995 (inf) loss_scale 512.0000 (516.1899) mem 22339MB +[2024-07-25 12:59:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [214/300][620/625] eta 0:00:02 lr 0.000262 wd 0.0500 time 0.5675 (0.5734) data time 0.0004 (0.0015) model time 0.5670 (0.5723) loss 5.6946 (7.2148) grad_norm 1.6534 (inf) loss_scale 512.0000 (516.1224) mem 22339MB +[2024-07-25 12:59:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 214 training takes 0:05:58 +[2024-07-25 12:59:47 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 12:59:49 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 12:59:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.478 (0.478) Loss 0.5005 (0.5005) Acc@1 90.674 (90.674) Acc@5 98.828 (98.828) Mem 22339MB +[2024-07-25 12:59:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7612 (0.6134) Acc@1 82.324 (87.385) Acc@5 96.680 (97.923) Mem 22339MB +[2024-07-25 12:59:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8691 (0.7085) Acc@1 79.150 (84.628) Acc@5 95.850 (97.052) Mem 22339MB +[2024-07-25 12:59:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.267 Acc@5 97.073 +[2024-07-25 12:59:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.3% +[2024-07-25 12:59:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.911 (0.911) Loss 0.5049 (0.5049) Acc@1 90.430 (90.430) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 12:59:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.197) Loss 0.7485 (0.6204) Acc@1 83.301 (87.589) Acc@5 96.875 (98.029) Mem 22339MB +[2024-07-25 12:59:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.163) Loss 0.8633 (0.7101) Acc@1 78.857 (84.689) Acc@5 96.045 (97.180) Mem 22339MB +[2024-07-25 12:59:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.323 Acc@5 97.163 +[2024-07-25 12:59:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.3% +[2024-07-25 12:59:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.32% +[2024-07-25 12:59:56 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 12:59:58 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 12:59:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][0/625] eta 0:08:35 lr 0.000262 wd 0.0500 time 0.8252 (0.8252) data time 0.3070 (0.3070) model time 0.0000 (0.0000) loss 6.6064 (6.6064) grad_norm 3.5587 (3.5587) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:00:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][10/625] eta 0:06:04 lr 0.000262 wd 0.0500 time 0.5686 (0.5927) data time 0.0006 (0.0295) model time 0.0000 (0.0000) loss 7.7792 (7.5691) grad_norm 2.2015 (2.3465) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:00:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][20/625] eta 0:05:52 lr 0.000262 wd 0.0500 time 0.5658 (0.5832) data time 0.0007 (0.0159) model time 0.0000 (0.0000) loss 8.2731 (7.4699) grad_norm 3.2561 (2.4291) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:00:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][30/625] eta 0:05:44 lr 0.000262 wd 0.0500 time 0.5716 (0.5792) data time 0.0008 (0.0110) model time 0.0000 (0.0000) loss 6.5876 (7.3699) grad_norm 3.4053 (2.8888) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:00:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][40/625] eta 0:05:37 lr 0.000262 wd 0.0500 time 0.5680 (0.5767) data time 0.0007 (0.0085) model time 0.0000 (0.0000) loss 8.2643 (7.4135) grad_norm 5.6396 (2.9573) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:00:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][50/625] eta 0:05:30 lr 0.000262 wd 0.0500 time 0.5698 (0.5754) data time 0.0008 (0.0070) model time 0.0000 (0.0000) loss 7.7535 (7.4516) grad_norm 2.3197 (2.7988) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:00:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][60/625] eta 0:05:24 lr 0.000262 wd 0.0500 time 0.5690 (0.5749) data time 0.0006 (0.0060) model time 0.5684 (0.5718) loss 8.6446 (7.4848) grad_norm 2.8020 (2.7821) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:00:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][70/625] eta 0:05:19 lr 0.000262 wd 0.0500 time 0.5626 (0.5748) data time 0.0007 (0.0053) model time 0.5619 (0.5726) loss 8.2610 (7.4810) grad_norm 4.1278 (2.8109) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:00:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][80/625] eta 0:05:13 lr 0.000262 wd 0.0500 time 0.5694 (0.5746) data time 0.0008 (0.0047) model time 0.5686 (0.5725) loss 7.8101 (7.4553) grad_norm 2.2945 (2.7931) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:00:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][90/625] eta 0:05:07 lr 0.000261 wd 0.0500 time 0.5666 (0.5741) data time 0.0008 (0.0043) model time 0.5658 (0.5716) loss 8.8190 (7.4641) grad_norm 5.0920 (2.8282) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:00:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][100/625] eta 0:05:01 lr 0.000261 wd 0.0500 time 0.5650 (0.5736) data time 0.0008 (0.0039) model time 0.5643 (0.5709) loss 6.8637 (7.4447) grad_norm 2.1396 (2.8138) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:01:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][110/625] eta 0:04:55 lr 0.000261 wd 0.0500 time 0.5677 (0.5732) data time 0.0008 (0.0037) model time 0.5669 (0.5705) loss 7.7432 (7.4158) grad_norm 5.1266 (2.8166) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:01:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][120/625] eta 0:04:49 lr 0.000261 wd 0.0500 time 0.5653 (0.5739) data time 0.0010 (0.0034) model time 0.5643 (0.5719) loss 7.7946 (7.4120) grad_norm 2.9815 (2.8067) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:01:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][130/625] eta 0:04:44 lr 0.000261 wd 0.0500 time 0.5679 (0.5740) data time 0.0008 (0.0032) model time 0.5671 (0.5723) loss 6.2349 (7.3696) grad_norm 2.1463 (2.8220) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:01:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][140/625] eta 0:04:38 lr 0.000261 wd 0.0500 time 0.5659 (0.5741) data time 0.0006 (0.0031) model time 0.5653 (0.5725) loss 7.0896 (7.3588) grad_norm 3.5287 (2.8904) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:01:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][150/625] eta 0:04:32 lr 0.000261 wd 0.0500 time 0.5710 (0.5740) data time 0.0006 (0.0029) model time 0.5704 (0.5724) loss 8.2516 (7.3578) grad_norm 2.1005 (2.8800) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:01:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][160/625] eta 0:04:27 lr 0.000261 wd 0.0500 time 0.5671 (0.5751) data time 0.0006 (0.0028) model time 0.5665 (0.5742) loss 8.3357 (7.3580) grad_norm 2.7116 (2.8529) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:01:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][170/625] eta 0:04:21 lr 0.000261 wd 0.0500 time 0.5670 (0.5755) data time 0.0009 (0.0027) model time 0.5661 (0.5747) loss 6.7321 (7.3497) grad_norm 2.7099 (2.8256) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:01:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][180/625] eta 0:04:15 lr 0.000261 wd 0.0500 time 0.5680 (0.5752) data time 0.0008 (0.0026) model time 0.5673 (0.5743) loss 7.2912 (7.3428) grad_norm 2.9651 (2.8505) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:01:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][190/625] eta 0:04:10 lr 0.000261 wd 0.0500 time 0.5665 (0.5748) data time 0.0006 (0.0025) model time 0.5660 (0.5738) loss 7.3604 (7.3491) grad_norm 4.9880 (2.9111) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:01:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][200/625] eta 0:04:04 lr 0.000261 wd 0.0500 time 0.5658 (0.5745) data time 0.0007 (0.0024) model time 0.5652 (0.5734) loss 7.2035 (7.3218) grad_norm 2.4587 (2.8870) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:01:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][210/625] eta 0:03:58 lr 0.000260 wd 0.0500 time 0.5702 (0.5744) data time 0.0008 (0.0023) model time 0.5694 (0.5732) loss 6.8333 (7.3100) grad_norm 2.8853 (2.8749) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:02:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][220/625] eta 0:03:52 lr 0.000260 wd 0.0500 time 0.5684 (0.5742) data time 0.0005 (0.0023) model time 0.5679 (0.5731) loss 7.2247 (7.2966) grad_norm 3.9616 (2.8498) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:02:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][230/625] eta 0:03:46 lr 0.000260 wd 0.0500 time 0.5710 (0.5741) data time 0.0007 (0.0022) model time 0.5704 (0.5729) loss 8.6463 (7.3135) grad_norm 2.9992 (2.8307) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:02:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][240/625] eta 0:03:40 lr 0.000260 wd 0.0500 time 0.5628 (0.5740) data time 0.0007 (0.0021) model time 0.5621 (0.5728) loss 7.1927 (7.3217) grad_norm 2.1840 (2.8156) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:02:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][250/625] eta 0:03:35 lr 0.000260 wd 0.0500 time 0.5683 (0.5738) data time 0.0007 (0.0021) model time 0.5676 (0.5726) loss 7.8528 (7.3102) grad_norm 2.8769 (2.8013) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:02:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][260/625] eta 0:03:29 lr 0.000260 wd 0.0500 time 0.5614 (0.5739) data time 0.0006 (0.0020) model time 0.5607 (0.5728) loss 7.0584 (7.3030) grad_norm 2.2699 (2.8037) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:02:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][270/625] eta 0:03:23 lr 0.000260 wd 0.0500 time 0.5713 (0.5738) data time 0.0007 (0.0020) model time 0.5706 (0.5726) loss 6.3145 (7.3000) grad_norm 3.2338 (2.8432) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:02:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][280/625] eta 0:03:17 lr 0.000260 wd 0.0500 time 0.5699 (0.5737) data time 0.0006 (0.0019) model time 0.5693 (0.5725) loss 8.0762 (7.2933) grad_norm 1.6303 (2.8339) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:02:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][290/625] eta 0:03:12 lr 0.000260 wd 0.0500 time 0.5691 (0.5736) data time 0.0006 (0.0019) model time 0.5685 (0.5724) loss 6.8939 (7.2974) grad_norm 3.4799 (2.8259) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:02:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][300/625] eta 0:03:06 lr 0.000260 wd 0.0500 time 0.5694 (0.5735) data time 0.0006 (0.0019) model time 0.5688 (0.5723) loss 6.2211 (7.2955) grad_norm 2.2252 (2.8089) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:02:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][310/625] eta 0:03:00 lr 0.000260 wd 0.0500 time 0.5701 (0.5734) data time 0.0007 (0.0018) model time 0.5694 (0.5722) loss 7.9230 (7.3049) grad_norm 2.6930 (2.7870) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:03:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][320/625] eta 0:02:54 lr 0.000259 wd 0.0500 time 0.5658 (0.5733) data time 0.0006 (0.0018) model time 0.5652 (0.5721) loss 6.1863 (7.2925) grad_norm 1.9759 (2.7641) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:03:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][330/625] eta 0:02:49 lr 0.000259 wd 0.0500 time 0.5698 (0.5733) data time 0.0006 (0.0018) model time 0.5692 (0.5721) loss 7.0110 (7.2815) grad_norm 1.7141 (2.7513) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:03:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][340/625] eta 0:02:43 lr 0.000259 wd 0.0500 time 0.5694 (0.5734) data time 0.0007 (0.0017) model time 0.5688 (0.5722) loss 6.6730 (7.2841) grad_norm 2.9378 (2.7541) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:03:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][350/625] eta 0:02:37 lr 0.000259 wd 0.0500 time 0.5705 (0.5733) data time 0.0006 (0.0017) model time 0.5699 (0.5721) loss 5.9381 (7.2854) grad_norm 1.8140 (2.7602) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:03:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][360/625] eta 0:02:31 lr 0.000259 wd 0.0500 time 0.5691 (0.5732) data time 0.0009 (0.0017) model time 0.5683 (0.5721) loss 6.5973 (7.2976) grad_norm 1.9345 (2.7393) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:03:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][370/625] eta 0:02:26 lr 0.000259 wd 0.0500 time 0.5717 (0.5732) data time 0.0009 (0.0017) model time 0.5708 (0.5720) loss 8.5757 (7.2857) grad_norm 2.8875 (2.7303) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:03:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][380/625] eta 0:02:20 lr 0.000259 wd 0.0500 time 0.5672 (0.5735) data time 0.0006 (0.0017) model time 0.5666 (0.5724) loss 7.1527 (7.2795) grad_norm 1.6545 (2.7106) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:03:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][390/625] eta 0:02:14 lr 0.000259 wd 0.0500 time 0.5687 (0.5736) data time 0.0008 (0.0016) model time 0.5678 (0.5726) loss 7.2061 (7.2887) grad_norm 2.9417 (2.7093) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:03:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][400/625] eta 0:02:09 lr 0.000259 wd 0.0500 time 0.5675 (0.5735) data time 0.0008 (0.0016) model time 0.5668 (0.5724) loss 6.4206 (7.2839) grad_norm 2.7877 (2.7019) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:03:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][410/625] eta 0:02:03 lr 0.000259 wd 0.0500 time 0.5688 (0.5734) data time 0.0008 (0.0016) model time 0.5680 (0.5723) loss 7.0743 (7.2806) grad_norm 2.1268 (2.6917) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:03:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][420/625] eta 0:01:57 lr 0.000259 wd 0.0500 time 0.5645 (0.5733) data time 0.0007 (0.0016) model time 0.5638 (0.5722) loss 7.5289 (7.2746) grad_norm 1.7197 (2.6755) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:04:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][430/625] eta 0:01:51 lr 0.000259 wd 0.0500 time 0.5658 (0.5733) data time 0.0009 (0.0016) model time 0.5649 (0.5721) loss 7.0206 (7.2822) grad_norm 2.1352 (2.6641) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:04:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][440/625] eta 0:01:46 lr 0.000258 wd 0.0500 time 0.5683 (0.5732) data time 0.0008 (0.0015) model time 0.5675 (0.5721) loss 8.1581 (7.2872) grad_norm 2.3493 (2.6678) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:04:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][450/625] eta 0:01:40 lr 0.000258 wd 0.0500 time 0.5676 (0.5732) data time 0.0006 (0.0015) model time 0.5669 (0.5721) loss 7.7269 (7.2796) grad_norm 2.2701 (2.6600) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:04:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][460/625] eta 0:01:34 lr 0.000258 wd 0.0500 time 0.5651 (0.5731) data time 0.0009 (0.0015) model time 0.5643 (0.5720) loss 8.0810 (7.2903) grad_norm 2.0474 (2.6486) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:04:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][470/625] eta 0:01:28 lr 0.000258 wd 0.0500 time 0.5659 (0.5730) data time 0.0010 (0.0015) model time 0.5649 (0.5719) loss 8.1855 (7.2947) grad_norm 3.2876 (2.6510) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:04:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][480/625] eta 0:01:23 lr 0.000258 wd 0.0500 time 0.5687 (0.5729) data time 0.0009 (0.0015) model time 0.5679 (0.5718) loss 6.3502 (7.2903) grad_norm 2.4414 (2.6422) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:04:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][490/625] eta 0:01:17 lr 0.000258 wd 0.0500 time 0.5712 (0.5729) data time 0.0010 (0.0015) model time 0.5703 (0.5718) loss 7.4886 (7.2901) grad_norm 2.0058 (2.6743) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:04:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][500/625] eta 0:01:11 lr 0.000258 wd 0.0500 time 0.5675 (0.5728) data time 0.0009 (0.0015) model time 0.5666 (0.5717) loss 6.6483 (7.2863) grad_norm 2.3436 (2.6788) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:04:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][510/625] eta 0:01:05 lr 0.000258 wd 0.0500 time 0.5716 (0.5728) data time 0.0008 (0.0014) model time 0.5708 (0.5717) loss 7.1995 (7.2895) grad_norm 1.9317 (2.6692) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:04:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][520/625] eta 0:01:00 lr 0.000258 wd 0.0500 time 0.5621 (0.5727) data time 0.0008 (0.0014) model time 0.5613 (0.5716) loss 7.8713 (7.2829) grad_norm 2.0916 (2.6621) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:05:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][530/625] eta 0:00:54 lr 0.000258 wd 0.0500 time 0.5665 (0.5727) data time 0.0007 (0.0014) model time 0.5658 (0.5716) loss 8.0887 (7.2857) grad_norm 1.6966 (2.6581) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:05:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][540/625] eta 0:00:48 lr 0.000258 wd 0.0500 time 0.5652 (0.5726) data time 0.0006 (0.0014) model time 0.5646 (0.5715) loss 6.8097 (7.2820) grad_norm 1.8661 (2.6495) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:05:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][550/625] eta 0:00:42 lr 0.000258 wd 0.0500 time 0.5691 (0.5726) data time 0.0006 (0.0014) model time 0.5685 (0.5714) loss 7.1734 (7.2888) grad_norm 3.5202 (2.6519) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:05:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][560/625] eta 0:00:37 lr 0.000257 wd 0.0500 time 0.5666 (0.5728) data time 0.0008 (0.0014) model time 0.5658 (0.5717) loss 8.1558 (7.2918) grad_norm 1.7921 (2.6524) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:05:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][570/625] eta 0:00:31 lr 0.000257 wd 0.0500 time 0.5709 (0.5728) data time 0.0008 (0.0014) model time 0.5700 (0.5717) loss 7.0870 (7.2966) grad_norm 2.3940 (2.6685) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:05:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][580/625] eta 0:00:25 lr 0.000257 wd 0.0500 time 0.5688 (0.5727) data time 0.0008 (0.0014) model time 0.5680 (0.5717) loss 8.5710 (7.3053) grad_norm 2.1402 (2.6622) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:05:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][590/625] eta 0:00:20 lr 0.000257 wd 0.0500 time 0.5696 (0.5727) data time 0.0006 (0.0014) model time 0.5689 (0.5716) loss 7.3728 (7.3062) grad_norm 1.6305 (2.6556) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:05:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][600/625] eta 0:00:14 lr 0.000257 wd 0.0500 time 0.7626 (0.5730) data time 0.0007 (0.0013) model time 0.7619 (0.5720) loss 7.0736 (7.3047) grad_norm 2.9559 (2.6547) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:05:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][610/625] eta 0:00:08 lr 0.000257 wd 0.0500 time 0.5618 (0.5731) data time 0.0005 (0.0013) model time 0.5613 (0.5720) loss 7.7656 (7.3122) grad_norm 2.3443 (2.6520) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:05:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [215/300][620/625] eta 0:00:02 lr 0.000257 wd 0.0500 time 0.5660 (0.5730) data time 0.0004 (0.0013) model time 0.5656 (0.5720) loss 5.9883 (7.3091) grad_norm 4.9656 (2.6512) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:05:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 215 training takes 0:05:58 +[2024-07-25 13:05:56 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 13:05:57 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 13:05:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.475 (0.475) Loss 0.5093 (0.5093) Acc@1 89.941 (89.941) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-25 13:05:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.158) Loss 0.7715 (0.6222) Acc@1 82.227 (87.469) Acc@5 96.875 (97.994) Mem 22339MB +[2024-07-25 13:06:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8755 (0.7140) Acc@1 79.492 (84.663) Acc@5 95.801 (97.135) Mem 22339MB +[2024-07-25 13:06:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.283 Acc@5 97.129 +[2024-07-25 13:06:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.3% +[2024-07-25 13:06:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.874 (0.874) Loss 0.5054 (0.5054) Acc@1 90.430 (90.430) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 13:06:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.194) Loss 0.7485 (0.6204) Acc@1 83.203 (87.584) Acc@5 96.826 (98.029) Mem 22339MB +[2024-07-25 13:06:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.161) Loss 0.8628 (0.7099) Acc@1 78.906 (84.689) Acc@5 96.045 (97.189) Mem 22339MB +[2024-07-25 13:06:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.317 Acc@5 97.177 +[2024-07-25 13:06:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.3% +[2024-07-25 13:06:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][0/625] eta 0:14:56 lr 0.000257 wd 0.0500 time 1.4342 (1.4342) data time 0.7404 (0.7404) model time 0.0000 (0.0000) loss 6.6050 (6.6050) grad_norm 2.5356 (2.5356) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:06:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][10/625] eta 0:06:38 lr 0.000257 wd 0.0500 time 0.5715 (0.6487) data time 0.0008 (0.0681) model time 0.0000 (0.0000) loss 7.1234 (7.4754) grad_norm 1.9214 (2.4442) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:06:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][20/625] eta 0:06:10 lr 0.000257 wd 0.0500 time 0.5695 (0.6122) data time 0.0008 (0.0360) model time 0.0000 (0.0000) loss 7.5384 (7.1858) grad_norm 1.6879 (2.4415) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:06:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][30/625] eta 0:05:56 lr 0.000257 wd 0.0500 time 0.5734 (0.5987) data time 0.0007 (0.0247) model time 0.0000 (0.0000) loss 7.7766 (7.2217) grad_norm 4.2419 (2.6259) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:06:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][40/625] eta 0:05:46 lr 0.000257 wd 0.0500 time 0.5708 (0.5920) data time 0.0007 (0.0189) model time 0.0000 (0.0000) loss 8.2401 (7.0993) grad_norm 2.4624 (2.7638) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:06:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][50/625] eta 0:05:37 lr 0.000256 wd 0.0500 time 0.5710 (0.5877) data time 0.0006 (0.0153) model time 0.0000 (0.0000) loss 6.5610 (7.2096) grad_norm 1.9980 (2.8445) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:06:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][60/625] eta 0:05:30 lr 0.000256 wd 0.0500 time 0.5699 (0.5848) data time 0.0008 (0.0131) model time 0.5691 (0.5684) loss 6.4824 (7.1822) grad_norm 2.3681 (2.7833) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:06:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][70/625] eta 0:05:23 lr 0.000256 wd 0.0500 time 0.5720 (0.5827) data time 0.0006 (0.0114) model time 0.5714 (0.5685) loss 6.1903 (7.1338) grad_norm 1.9382 (2.7484) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:06:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][80/625] eta 0:05:17 lr 0.000256 wd 0.0500 time 0.5736 (0.5820) data time 0.0006 (0.0101) model time 0.5731 (0.5711) loss 6.1566 (7.1558) grad_norm 2.6114 (2.7498) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:06:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][90/625] eta 0:05:10 lr 0.000256 wd 0.0500 time 0.5723 (0.5809) data time 0.0006 (0.0091) model time 0.5717 (0.5712) loss 5.6134 (7.1690) grad_norm 2.6708 (2.7336) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:07:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][100/625] eta 0:05:04 lr 0.000256 wd 0.0500 time 0.5908 (0.5809) data time 0.0006 (0.0082) model time 0.5902 (0.5730) loss 6.4598 (7.1683) grad_norm 2.6119 (2.7314) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:07:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][110/625] eta 0:04:58 lr 0.000256 wd 0.0500 time 0.5719 (0.5800) data time 0.0007 (0.0076) model time 0.5713 (0.5725) loss 6.5094 (7.1756) grad_norm 2.3673 (2.6942) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:07:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][120/625] eta 0:04:52 lr 0.000256 wd 0.0500 time 0.5687 (0.5791) data time 0.0007 (0.0070) model time 0.5680 (0.5719) loss 7.3416 (7.1741) grad_norm 3.0958 (2.6543) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:07:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][130/625] eta 0:04:46 lr 0.000256 wd 0.0500 time 0.5718 (0.5791) data time 0.0008 (0.0066) model time 0.5710 (0.5727) loss 7.6284 (7.1554) grad_norm 2.7922 (2.6342) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:07:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][140/625] eta 0:04:40 lr 0.000256 wd 0.0500 time 0.5721 (0.5786) data time 0.0009 (0.0062) model time 0.5711 (0.5725) loss 8.0344 (7.1179) grad_norm 1.7117 (2.6982) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:07:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][150/625] eta 0:04:34 lr 0.000256 wd 0.0500 time 0.5723 (0.5781) data time 0.0009 (0.0058) model time 0.5715 (0.5723) loss 6.3476 (7.1207) grad_norm 2.7086 (2.7065) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:07:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][160/625] eta 0:04:28 lr 0.000255 wd 0.0500 time 0.5708 (0.5777) data time 0.0007 (0.0055) model time 0.5701 (0.5722) loss 7.1314 (7.1246) grad_norm 2.1443 (2.7165) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:07:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][170/625] eta 0:04:22 lr 0.000255 wd 0.0500 time 0.5746 (0.5773) data time 0.0006 (0.0052) model time 0.5740 (0.5720) loss 6.7463 (7.1067) grad_norm 2.4630 (2.6988) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:07:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][180/625] eta 0:04:16 lr 0.000255 wd 0.0500 time 0.5691 (0.5770) data time 0.0009 (0.0050) model time 0.5682 (0.5719) loss 7.3917 (7.1184) grad_norm 2.0135 (2.6917) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:07:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][190/625] eta 0:04:10 lr 0.000255 wd 0.0500 time 0.5765 (0.5767) data time 0.0008 (0.0048) model time 0.5757 (0.5718) loss 7.0794 (7.1138) grad_norm 1.8812 (2.6709) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:08:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][200/625] eta 0:04:05 lr 0.000255 wd 0.0500 time 0.5695 (0.5781) data time 0.0009 (0.0046) model time 0.5685 (0.5740) loss 8.8911 (7.1351) grad_norm 2.5195 (2.6620) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:08:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][210/625] eta 0:03:59 lr 0.000255 wd 0.0500 time 0.5704 (0.5777) data time 0.0008 (0.0044) model time 0.5695 (0.5736) loss 6.7232 (7.1498) grad_norm 1.9568 (2.6411) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:08:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][220/625] eta 0:03:53 lr 0.000255 wd 0.0500 time 0.5728 (0.5774) data time 0.0006 (0.0042) model time 0.5722 (0.5734) loss 7.3513 (7.1694) grad_norm 2.9249 (2.6267) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:08:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][230/625] eta 0:03:47 lr 0.000255 wd 0.0500 time 0.5725 (0.5771) data time 0.0009 (0.0041) model time 0.5716 (0.5732) loss 6.1276 (7.1634) grad_norm 1.9130 (2.6255) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:08:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][240/625] eta 0:03:42 lr 0.000255 wd 0.0500 time 0.5699 (0.5769) data time 0.0007 (0.0039) model time 0.5692 (0.5731) loss 6.5644 (7.1632) grad_norm 2.2376 (2.6047) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:08:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][250/625] eta 0:03:36 lr 0.000255 wd 0.0500 time 0.5724 (0.5767) data time 0.0008 (0.0038) model time 0.5715 (0.5730) loss 8.0281 (7.1629) grad_norm 2.1431 (2.5840) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:08:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][260/625] eta 0:03:30 lr 0.000255 wd 0.0500 time 0.5701 (0.5764) data time 0.0006 (0.0037) model time 0.5695 (0.5728) loss 6.0694 (7.1715) grad_norm 1.8640 (2.5623) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:08:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][270/625] eta 0:03:24 lr 0.000255 wd 0.0500 time 0.5702 (0.5762) data time 0.0008 (0.0036) model time 0.5694 (0.5727) loss 7.0025 (7.1785) grad_norm 2.2367 (2.5658) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:08:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][280/625] eta 0:03:18 lr 0.000254 wd 0.0500 time 0.5695 (0.5760) data time 0.0006 (0.0035) model time 0.5690 (0.5725) loss 7.6299 (7.1851) grad_norm 2.2358 (2.5939) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:08:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][290/625] eta 0:03:12 lr 0.000254 wd 0.0500 time 0.5709 (0.5757) data time 0.0007 (0.0034) model time 0.5703 (0.5723) loss 6.6738 (7.1769) grad_norm 2.7124 (2.5829) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:08:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][300/625] eta 0:03:07 lr 0.000254 wd 0.0500 time 0.5734 (0.5756) data time 0.0006 (0.0033) model time 0.5728 (0.5722) loss 6.2873 (7.1697) grad_norm 2.3890 (2.5793) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:09:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][310/625] eta 0:03:01 lr 0.000254 wd 0.0500 time 0.5703 (0.5754) data time 0.0006 (0.0032) model time 0.5697 (0.5721) loss 7.4738 (7.1633) grad_norm 3.0267 (2.6025) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:09:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][320/625] eta 0:02:55 lr 0.000254 wd 0.0500 time 0.5693 (0.5752) data time 0.0006 (0.0032) model time 0.5687 (0.5720) loss 6.0206 (7.1726) grad_norm 3.2774 (2.6038) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:09:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][330/625] eta 0:02:49 lr 0.000254 wd 0.0500 time 0.5706 (0.5751) data time 0.0008 (0.0031) model time 0.5698 (0.5720) loss 7.2301 (7.1797) grad_norm 1.9063 (2.5960) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:09:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][340/625] eta 0:02:43 lr 0.000254 wd 0.0500 time 0.5715 (0.5750) data time 0.0008 (0.0030) model time 0.5707 (0.5719) loss 7.0903 (7.1858) grad_norm 2.0758 (2.5906) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:09:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][350/625] eta 0:02:38 lr 0.000254 wd 0.0500 time 0.5687 (0.5752) data time 0.0008 (0.0030) model time 0.5679 (0.5722) loss 8.6757 (7.1917) grad_norm 2.3691 (2.5982) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:09:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][360/625] eta 0:02:32 lr 0.000254 wd 0.0500 time 0.5748 (0.5751) data time 0.0008 (0.0029) model time 0.5740 (0.5721) loss 8.6367 (7.1885) grad_norm 2.2808 (2.6000) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:09:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][370/625] eta 0:02:26 lr 0.000254 wd 0.0500 time 0.5710 (0.5749) data time 0.0008 (0.0029) model time 0.5702 (0.5720) loss 7.7448 (7.1871) grad_norm 2.5521 (2.5913) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:09:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][380/625] eta 0:02:20 lr 0.000254 wd 0.0500 time 0.5792 (0.5749) data time 0.0009 (0.0028) model time 0.5782 (0.5720) loss 6.5458 (7.1968) grad_norm 2.1654 (2.5929) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:09:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][390/625] eta 0:02:15 lr 0.000253 wd 0.0500 time 0.5737 (0.5748) data time 0.0008 (0.0028) model time 0.5729 (0.5720) loss 8.0257 (7.1986) grad_norm 2.7145 (2.5963) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:09:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][400/625] eta 0:02:09 lr 0.000253 wd 0.0500 time 0.5693 (0.5747) data time 0.0007 (0.0027) model time 0.5686 (0.5719) loss 7.0958 (7.1919) grad_norm 2.0205 (2.5965) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:10:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][410/625] eta 0:02:03 lr 0.000253 wd 0.0500 time 0.5688 (0.5745) data time 0.0007 (0.0027) model time 0.5680 (0.5718) loss 8.9417 (7.1929) grad_norm 2.5584 (2.6027) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:10:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][420/625] eta 0:01:57 lr 0.000253 wd 0.0500 time 0.5722 (0.5753) data time 0.0007 (0.0026) model time 0.5715 (0.5727) loss 6.9226 (7.1861) grad_norm 1.8110 (2.5902) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:10:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][430/625] eta 0:01:52 lr 0.000253 wd 0.0500 time 0.5696 (0.5752) data time 0.0006 (0.0026) model time 0.5689 (0.5726) loss 6.9535 (7.1863) grad_norm 2.6064 (2.5868) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:10:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][440/625] eta 0:01:46 lr 0.000253 wd 0.0500 time 0.5760 (0.5751) data time 0.0008 (0.0025) model time 0.5752 (0.5726) loss 6.3703 (7.1862) grad_norm 2.0176 (2.5910) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:10:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][450/625] eta 0:01:40 lr 0.000253 wd 0.0500 time 0.5734 (0.5750) data time 0.0008 (0.0025) model time 0.5726 (0.5725) loss 6.7429 (7.1923) grad_norm 1.9855 (2.5813) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:10:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][460/625] eta 0:01:34 lr 0.000253 wd 0.0500 time 0.5701 (0.5749) data time 0.0008 (0.0025) model time 0.5693 (0.5725) loss 8.2802 (7.2015) grad_norm 2.3981 (2.5779) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:10:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][470/625] eta 0:01:29 lr 0.000253 wd 0.0500 time 0.5682 (0.5750) data time 0.0006 (0.0024) model time 0.5676 (0.5726) loss 7.4070 (7.2021) grad_norm 1.7275 (2.5672) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:10:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][480/625] eta 0:01:23 lr 0.000253 wd 0.0500 time 0.5757 (0.5749) data time 0.0006 (0.0024) model time 0.5751 (0.5725) loss 6.0239 (7.2065) grad_norm 2.1439 (2.5542) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:10:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][490/625] eta 0:01:17 lr 0.000253 wd 0.0500 time 0.5715 (0.5748) data time 0.0012 (0.0024) model time 0.5702 (0.5724) loss 7.3193 (7.2080) grad_norm 1.9697 (2.5479) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:10:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][500/625] eta 0:01:11 lr 0.000253 wd 0.0500 time 0.5805 (0.5747) data time 0.0008 (0.0023) model time 0.5797 (0.5724) loss 7.2937 (7.2094) grad_norm 3.6537 (2.5462) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:10:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][510/625] eta 0:01:06 lr 0.000252 wd 0.0500 time 0.5692 (0.5746) data time 0.0008 (0.0023) model time 0.5684 (0.5723) loss 8.2684 (7.2178) grad_norm 2.4363 (2.5351) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:11:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][520/625] eta 0:01:00 lr 0.000252 wd 0.0500 time 0.5697 (0.5746) data time 0.0008 (0.0023) model time 0.5689 (0.5723) loss 8.1371 (7.2362) grad_norm 2.0013 (2.5239) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:11:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][530/625] eta 0:00:54 lr 0.000252 wd 0.0500 time 0.5732 (0.5746) data time 0.0008 (0.0022) model time 0.5724 (0.5723) loss 8.6042 (7.2360) grad_norm 2.0898 (2.5168) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:11:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][540/625] eta 0:00:48 lr 0.000252 wd 0.0500 time 0.5733 (0.5746) data time 0.0006 (0.0022) model time 0.5728 (0.5723) loss 6.3811 (7.2343) grad_norm 2.7280 (2.5207) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:11:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][550/625] eta 0:00:43 lr 0.000252 wd 0.0500 time 0.5693 (0.5746) data time 0.0008 (0.0022) model time 0.5685 (0.5724) loss 6.6014 (7.2342) grad_norm 1.5610 (2.5163) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:11:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][560/625] eta 0:00:37 lr 0.000252 wd 0.0500 time 0.5702 (0.5745) data time 0.0009 (0.0022) model time 0.5693 (0.5723) loss 6.8395 (7.2325) grad_norm 1.9024 (2.5154) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:11:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][570/625] eta 0:00:31 lr 0.000252 wd 0.0500 time 0.5810 (0.5747) data time 0.0006 (0.0022) model time 0.5804 (0.5725) loss 7.7948 (7.2351) grad_norm 1.7023 (2.5103) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:11:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][580/625] eta 0:00:25 lr 0.000252 wd 0.0500 time 0.5702 (0.5747) data time 0.0006 (0.0021) model time 0.5696 (0.5726) loss 7.7906 (7.2354) grad_norm 7.7566 (2.5155) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:11:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][590/625] eta 0:00:20 lr 0.000252 wd 0.0500 time 0.5801 (0.5746) data time 0.0008 (0.0021) model time 0.5793 (0.5726) loss 6.5097 (7.2416) grad_norm 2.9825 (2.5193) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:11:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][600/625] eta 0:00:14 lr 0.000252 wd 0.0500 time 0.5720 (0.5746) data time 0.0008 (0.0021) model time 0.5712 (0.5725) loss 6.0635 (7.2450) grad_norm 2.2000 (2.5248) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:11:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][610/625] eta 0:00:08 lr 0.000252 wd 0.0500 time 0.5698 (0.5745) data time 0.0006 (0.0021) model time 0.5692 (0.5725) loss 7.9207 (7.2427) grad_norm 2.2711 (2.5323) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:12:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [216/300][620/625] eta 0:00:02 lr 0.000252 wd 0.0500 time 0.5696 (0.5745) data time 0.0006 (0.0020) model time 0.5690 (0.5724) loss 7.9778 (7.2352) grad_norm 2.0279 (2.5291) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:12:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 216 training takes 0:05:59 +[2024-07-25 13:12:04 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 13:12:05 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 13:12:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.470 (0.470) Loss 0.5098 (0.5098) Acc@1 89.941 (89.941) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-25 13:12:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.157) Loss 0.7593 (0.6139) Acc@1 83.154 (87.620) Acc@5 96.777 (98.034) Mem 22339MB +[2024-07-25 13:12:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.8701 (0.7081) Acc@1 79.297 (84.768) Acc@5 96.045 (97.121) Mem 22339MB +[2024-07-25 13:12:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.389 Acc@5 97.131 +[2024-07-25 13:12:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.4% +[2024-07-25 13:12:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 84.39% +[2024-07-25 13:12:09 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 13:12:10 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 13:12:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.499 (0.499) Loss 0.5054 (0.5054) Acc@1 90.430 (90.430) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 13:12:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.160) Loss 0.7485 (0.6201) Acc@1 83.252 (87.593) Acc@5 96.777 (98.011) Mem 22339MB +[2024-07-25 13:12:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.8623 (0.7094) Acc@1 78.857 (84.721) Acc@5 96.045 (97.187) Mem 22339MB +[2024-07-25 13:12:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.355 Acc@5 97.175 +[2024-07-25 13:12:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.4% +[2024-07-25 13:12:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.36% +[2024-07-25 13:12:13 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 13:12:15 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 13:12:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][0/625] eta 0:11:00 lr 0.000251 wd 0.0500 time 1.0568 (1.0568) data time 0.5377 (0.5377) model time 0.0000 (0.0000) loss 7.4375 (7.4375) grad_norm 1.9447 (1.9447) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:12:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][10/625] eta 0:06:28 lr 0.000251 wd 0.0500 time 0.7633 (0.6321) data time 0.0006 (0.0496) model time 0.0000 (0.0000) loss 7.9656 (7.4077) grad_norm 1.8585 (1.9289) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:12:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][20/625] eta 0:06:08 lr 0.000251 wd 0.0500 time 0.5690 (0.6086) data time 0.0006 (0.0263) model time 0.0000 (0.0000) loss 7.8705 (7.3059) grad_norm 1.7902 (1.9492) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:12:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][30/625] eta 0:05:55 lr 0.000251 wd 0.0500 time 0.5690 (0.5972) data time 0.0006 (0.0181) model time 0.0000 (0.0000) loss 8.9026 (7.3381) grad_norm 2.3742 (1.9512) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:12:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][40/625] eta 0:05:45 lr 0.000251 wd 0.0500 time 0.5726 (0.5907) data time 0.0007 (0.0139) model time 0.0000 (0.0000) loss 7.6678 (7.3698) grad_norm 1.9214 (1.9734) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:12:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][50/625] eta 0:05:37 lr 0.000251 wd 0.0500 time 0.5701 (0.5867) data time 0.0007 (0.0113) model time 0.0000 (0.0000) loss 6.7583 (7.3317) grad_norm 1.8434 (1.9795) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:12:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][60/625] eta 0:05:29 lr 0.000251 wd 0.0500 time 0.5657 (0.5838) data time 0.0006 (0.0096) model time 0.5651 (0.5682) loss 6.7391 (7.3322) grad_norm 1.6287 (1.9628) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:12:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][70/625] eta 0:05:24 lr 0.000251 wd 0.0500 time 0.7601 (0.5846) data time 0.0007 (0.0084) model time 0.7594 (0.5785) loss 8.3958 (7.2860) grad_norm 2.7847 (1.9667) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:13:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][80/625] eta 0:05:17 lr 0.000251 wd 0.0500 time 0.5701 (0.5822) data time 0.0006 (0.0074) model time 0.5695 (0.5737) loss 6.7062 (7.2627) grad_norm 1.7930 (2.0689) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:13:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][90/625] eta 0:05:10 lr 0.000251 wd 0.0500 time 0.5716 (0.5808) data time 0.0008 (0.0067) model time 0.5708 (0.5724) loss 6.5194 (7.2260) grad_norm 2.6791 (2.0893) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:13:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][100/625] eta 0:05:04 lr 0.000251 wd 0.0500 time 0.5686 (0.5796) data time 0.0008 (0.0061) model time 0.5678 (0.5715) loss 7.6865 (7.2467) grad_norm 2.4421 (2.1453) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:13:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][110/625] eta 0:04:58 lr 0.000251 wd 0.0500 time 0.5693 (0.5787) data time 0.0006 (0.0056) model time 0.5686 (0.5711) loss 8.4507 (7.2702) grad_norm 3.0640 (2.1931) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:13:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][120/625] eta 0:04:51 lr 0.000250 wd 0.0500 time 0.5718 (0.5781) data time 0.0006 (0.0052) model time 0.5712 (0.5711) loss 7.7148 (7.2385) grad_norm 2.3557 (2.2294) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:13:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][130/625] eta 0:04:45 lr 0.000250 wd 0.0500 time 0.5692 (0.5775) data time 0.0006 (0.0049) model time 0.5686 (0.5708) loss 6.2581 (7.2201) grad_norm 2.2673 (2.2800) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:13:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][140/625] eta 0:04:39 lr 0.000250 wd 0.0500 time 0.5700 (0.5770) data time 0.0007 (0.0046) model time 0.5693 (0.5707) loss 7.5201 (7.2024) grad_norm 2.5909 (2.3300) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:13:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][150/625] eta 0:04:33 lr 0.000250 wd 0.0500 time 0.5699 (0.5766) data time 0.0009 (0.0044) model time 0.5690 (0.5706) loss 6.0387 (7.1842) grad_norm 2.3908 (2.3237) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:13:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][160/625] eta 0:04:27 lr 0.000250 wd 0.0500 time 0.5702 (0.5761) data time 0.0008 (0.0041) model time 0.5694 (0.5704) loss 7.9847 (7.1974) grad_norm 2.1228 (2.3108) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:13:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][170/625] eta 0:04:21 lr 0.000250 wd 0.0500 time 0.5764 (0.5758) data time 0.0007 (0.0039) model time 0.5757 (0.5703) loss 6.5403 (7.1845) grad_norm 2.2780 (2.3263) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:13:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][180/625] eta 0:04:16 lr 0.000250 wd 0.0500 time 0.5714 (0.5755) data time 0.0008 (0.0038) model time 0.5705 (0.5703) loss 7.6913 (7.1743) grad_norm 1.8151 (2.3207) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:14:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][190/625] eta 0:04:10 lr 0.000250 wd 0.0500 time 0.5697 (0.5753) data time 0.0007 (0.0036) model time 0.5690 (0.5704) loss 7.4059 (7.1684) grad_norm 3.2206 (2.3154) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:14:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][200/625] eta 0:04:04 lr 0.000250 wd 0.0500 time 0.5731 (0.5751) data time 0.0006 (0.0035) model time 0.5725 (0.5703) loss 6.2421 (7.1496) grad_norm 2.0779 (2.3254) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:14:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][210/625] eta 0:03:58 lr 0.000250 wd 0.0500 time 0.5743 (0.5749) data time 0.0008 (0.0034) model time 0.5735 (0.5703) loss 7.3527 (7.1604) grad_norm 1.8963 (2.3185) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:14:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][220/625] eta 0:03:52 lr 0.000250 wd 0.0500 time 0.5691 (0.5747) data time 0.0007 (0.0033) model time 0.5684 (0.5702) loss 8.1065 (7.1756) grad_norm 2.0819 (2.3019) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:14:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][230/625] eta 0:03:47 lr 0.000250 wd 0.0500 time 0.7637 (0.5753) data time 0.0006 (0.0031) model time 0.7630 (0.5712) loss 6.6761 (7.1718) grad_norm 1.7673 (2.2892) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:14:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][240/625] eta 0:03:41 lr 0.000249 wd 0.0500 time 0.5702 (0.5764) data time 0.0006 (0.0030) model time 0.5696 (0.5727) loss 7.4073 (7.1811) grad_norm 3.3367 (2.2846) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:14:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][250/625] eta 0:03:36 lr 0.000249 wd 0.0500 time 0.5696 (0.5762) data time 0.0008 (0.0030) model time 0.5688 (0.5726) loss 7.5991 (7.1765) grad_norm 2.1805 (2.2734) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:14:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][260/625] eta 0:03:30 lr 0.000249 wd 0.0500 time 0.5689 (0.5759) data time 0.0006 (0.0029) model time 0.5683 (0.5725) loss 7.2974 (7.1765) grad_norm 1.7155 (2.2818) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:14:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][270/625] eta 0:03:24 lr 0.000249 wd 0.0500 time 0.5696 (0.5757) data time 0.0008 (0.0028) model time 0.5688 (0.5723) loss 6.6085 (7.1824) grad_norm 1.8567 (2.3175) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:14:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][280/625] eta 0:03:18 lr 0.000249 wd 0.0500 time 0.5685 (0.5755) data time 0.0007 (0.0027) model time 0.5678 (0.5722) loss 7.4349 (7.1926) grad_norm 2.6855 (2.3638) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:15:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][290/625] eta 0:03:12 lr 0.000249 wd 0.0500 time 0.5257 (0.5756) data time 0.0008 (0.0027) model time 0.5249 (0.5724) loss 6.9276 (7.1940) grad_norm 3.4747 (2.3785) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:15:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][300/625] eta 0:03:07 lr 0.000249 wd 0.0500 time 0.5664 (0.5754) data time 0.0009 (0.0026) model time 0.5655 (0.5723) loss 7.3710 (7.1992) grad_norm 2.5332 (2.3738) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:15:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][310/625] eta 0:03:01 lr 0.000249 wd 0.0500 time 0.5720 (0.5753) data time 0.0006 (0.0025) model time 0.5713 (0.5722) loss 7.4923 (7.2095) grad_norm 2.7416 (2.3754) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:15:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][320/625] eta 0:02:55 lr 0.000249 wd 0.0500 time 0.5726 (0.5752) data time 0.0008 (0.0025) model time 0.5717 (0.5722) loss 7.6296 (7.2261) grad_norm 13.1413 (2.4135) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:15:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][330/625] eta 0:02:49 lr 0.000249 wd 0.0500 time 0.5692 (0.5751) data time 0.0006 (0.0024) model time 0.5686 (0.5721) loss 6.2625 (7.2288) grad_norm 1.9709 (2.4147) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:15:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][340/625] eta 0:02:43 lr 0.000249 wd 0.0500 time 0.5678 (0.5750) data time 0.0008 (0.0024) model time 0.5670 (0.5721) loss 7.2581 (7.2343) grad_norm 1.8009 (2.4131) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:15:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][350/625] eta 0:02:38 lr 0.000248 wd 0.0500 time 0.5714 (0.5749) data time 0.0006 (0.0023) model time 0.5708 (0.5720) loss 6.2515 (7.2299) grad_norm 2.8913 (2.4817) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:15:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][360/625] eta 0:02:32 lr 0.000248 wd 0.0500 time 0.5702 (0.5747) data time 0.0008 (0.0023) model time 0.5694 (0.5719) loss 6.6352 (7.2341) grad_norm 2.5238 (2.4804) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:15:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][370/625] eta 0:02:26 lr 0.000248 wd 0.0500 time 0.5667 (0.5746) data time 0.0008 (0.0023) model time 0.5659 (0.5718) loss 8.3371 (7.2376) grad_norm 1.9912 (2.4811) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:15:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][380/625] eta 0:02:20 lr 0.000248 wd 0.0500 time 0.5672 (0.5744) data time 0.0010 (0.0022) model time 0.5662 (0.5717) loss 7.6520 (7.2433) grad_norm 2.9877 (2.4735) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:15:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][390/625] eta 0:02:14 lr 0.000248 wd 0.0500 time 0.5708 (0.5744) data time 0.0008 (0.0022) model time 0.5700 (0.5717) loss 7.1706 (7.2470) grad_norm 2.3458 (2.4714) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:16:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][400/625] eta 0:02:09 lr 0.000248 wd 0.0500 time 0.5701 (0.5743) data time 0.0009 (0.0022) model time 0.5692 (0.5716) loss 8.3774 (7.2548) grad_norm 1.6561 (2.4660) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:16:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][410/625] eta 0:02:03 lr 0.000248 wd 0.0500 time 0.5678 (0.5742) data time 0.0007 (0.0021) model time 0.5672 (0.5716) loss 7.3216 (7.2500) grad_norm 3.7473 (2.4586) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:16:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][420/625] eta 0:01:57 lr 0.000248 wd 0.0500 time 0.5683 (0.5742) data time 0.0006 (0.0021) model time 0.5677 (0.5716) loss 7.7897 (7.2558) grad_norm 3.0985 (2.4864) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:16:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][430/625] eta 0:01:51 lr 0.000248 wd 0.0500 time 0.5717 (0.5741) data time 0.0008 (0.0021) model time 0.5709 (0.5716) loss 8.5793 (7.2563) grad_norm 2.9049 (2.4874) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:16:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][440/625] eta 0:01:46 lr 0.000248 wd 0.0500 time 0.5669 (0.5740) data time 0.0007 (0.0020) model time 0.5661 (0.5715) loss 7.4516 (7.2551) grad_norm 2.1094 (2.4894) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:16:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][450/625] eta 0:01:40 lr 0.000248 wd 0.0500 time 0.5705 (0.5744) data time 0.0008 (0.0020) model time 0.5697 (0.5720) loss 8.3557 (7.2547) grad_norm 3.9986 (2.4985) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:16:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][460/625] eta 0:01:34 lr 0.000248 wd 0.0500 time 0.5702 (0.5753) data time 0.0006 (0.0020) model time 0.5696 (0.5731) loss 6.8733 (7.2581) grad_norm 2.0630 (2.4914) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:16:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][470/625] eta 0:01:29 lr 0.000247 wd 0.0500 time 0.5808 (0.5753) data time 0.0007 (0.0020) model time 0.5802 (0.5730) loss 8.3072 (7.2497) grad_norm 3.0432 (2.4922) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:16:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][480/625] eta 0:01:23 lr 0.000247 wd 0.0500 time 0.5702 (0.5752) data time 0.0008 (0.0019) model time 0.5694 (0.5730) loss 8.4633 (7.2674) grad_norm 1.9607 (2.4934) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:16:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][490/625] eta 0:01:17 lr 0.000247 wd 0.0500 time 0.5702 (0.5751) data time 0.0008 (0.0019) model time 0.5694 (0.5729) loss 7.2182 (7.2642) grad_norm 1.8938 (2.4850) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:17:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][500/625] eta 0:01:11 lr 0.000247 wd 0.0500 time 0.5757 (0.5750) data time 0.0008 (0.0019) model time 0.5749 (0.5729) loss 7.8817 (7.2597) grad_norm 2.4111 (2.4846) loss_scale 1024.0000 (513.0220) mem 22339MB +[2024-07-25 13:17:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][510/625] eta 0:01:06 lr 0.000247 wd 0.0500 time 0.7077 (0.5752) data time 0.0006 (0.0019) model time 0.7071 (0.5731) loss 6.2696 (7.2614) grad_norm 2.6578 (2.5027) loss_scale 1024.0000 (523.0215) mem 22339MB +[2024-07-25 13:17:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][520/625] eta 0:01:00 lr 0.000247 wd 0.0500 time 0.5730 (0.5751) data time 0.0008 (0.0019) model time 0.5722 (0.5730) loss 6.5506 (7.2564) grad_norm 1.8254 (2.4944) loss_scale 1024.0000 (532.6372) mem 22339MB +[2024-07-25 13:17:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][530/625] eta 0:00:54 lr 0.000247 wd 0.0500 time 0.6010 (0.5751) data time 0.0007 (0.0018) model time 0.6003 (0.5730) loss 6.1257 (7.2559) grad_norm 2.7580 (2.4875) loss_scale 1024.0000 (541.8908) mem 22339MB +[2024-07-25 13:17:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][540/625] eta 0:00:48 lr 0.000247 wd 0.0500 time 0.5684 (0.5750) data time 0.0006 (0.0018) model time 0.5678 (0.5730) loss 6.9578 (7.2542) grad_norm 2.0998 (2.4865) loss_scale 1024.0000 (550.8022) mem 22339MB +[2024-07-25 13:17:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][550/625] eta 0:00:43 lr 0.000247 wd 0.0500 time 0.5702 (0.5750) data time 0.0007 (0.0018) model time 0.5695 (0.5730) loss 7.8945 (7.2517) grad_norm 2.6958 (2.4896) loss_scale 1024.0000 (559.3902) mem 22339MB +[2024-07-25 13:17:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][560/625] eta 0:00:37 lr 0.000247 wd 0.0500 time 0.5696 (0.5749) data time 0.0010 (0.0018) model time 0.5687 (0.5729) loss 6.7094 (7.2459) grad_norm 2.6233 (2.5068) loss_scale 1024.0000 (567.6720) mem 22339MB +[2024-07-25 13:17:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][570/625] eta 0:00:31 lr 0.000247 wd 0.0500 time 0.5742 (0.5748) data time 0.0008 (0.0018) model time 0.5734 (0.5729) loss 7.4952 (7.2455) grad_norm 1.7634 (2.4994) loss_scale 1024.0000 (575.6637) mem 22339MB +[2024-07-25 13:17:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][580/625] eta 0:00:25 lr 0.000247 wd 0.0500 time 0.5712 (0.5748) data time 0.0008 (0.0017) model time 0.5704 (0.5728) loss 8.2851 (7.2477) grad_norm 2.1503 (2.4951) loss_scale 1024.0000 (583.3804) mem 22339MB +[2024-07-25 13:17:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][590/625] eta 0:00:20 lr 0.000246 wd 0.0500 time 0.5679 (0.5747) data time 0.0006 (0.0017) model time 0.5673 (0.5728) loss 6.3589 (7.2449) grad_norm 2.7020 (2.4905) loss_scale 1024.0000 (590.8359) mem 22339MB +[2024-07-25 13:18:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][600/625] eta 0:00:14 lr 0.000246 wd 0.0500 time 0.5733 (0.5746) data time 0.0006 (0.0017) model time 0.5727 (0.5727) loss 6.4228 (7.2396) grad_norm 2.5009 (2.4866) loss_scale 1024.0000 (598.0433) mem 22339MB +[2024-07-25 13:18:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][610/625] eta 0:00:08 lr 0.000246 wd 0.0500 time 0.5700 (0.5746) data time 0.0004 (0.0017) model time 0.5696 (0.5727) loss 7.6000 (7.2382) grad_norm 2.6964 (2.4854) loss_scale 1024.0000 (605.0147) mem 22339MB +[2024-07-25 13:18:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [217/300][620/625] eta 0:00:02 lr 0.000246 wd 0.0500 time 0.5672 (0.5745) data time 0.0006 (0.0017) model time 0.5666 (0.5726) loss 7.3305 (7.2414) grad_norm 2.4564 (2.4824) loss_scale 1024.0000 (611.7617) mem 22339MB +[2024-07-25 13:18:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 217 training takes 0:05:59 +[2024-07-25 13:18:14 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 13:18:15 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 13:18:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.471 (0.471) Loss 0.5005 (0.5005) Acc@1 90.381 (90.381) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 13:18:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7632 (0.6141) Acc@1 82.422 (87.607) Acc@5 96.924 (98.096) Mem 22339MB +[2024-07-25 13:18:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.8525 (0.7055) Acc@1 79.590 (84.898) Acc@5 96.191 (97.177) Mem 22339MB +[2024-07-25 13:18:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.497 Acc@5 97.175 +[2024-07-25 13:18:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.5% +[2024-07-25 13:18:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 84.50% +[2024-07-25 13:18:19 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 13:18:20 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 13:18:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.464 (0.464) Loss 0.5054 (0.5054) Acc@1 90.430 (90.430) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 13:18:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.156) Loss 0.7490 (0.6201) Acc@1 83.301 (87.629) Acc@5 96.777 (98.016) Mem 22339MB +[2024-07-25 13:18:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.8618 (0.7091) Acc@1 78.955 (84.763) Acc@5 96.045 (97.194) Mem 22339MB +[2024-07-25 13:18:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.391 Acc@5 97.183 +[2024-07-25 13:18:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.4% +[2024-07-25 13:18:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.39% +[2024-07-25 13:18:24 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 13:18:25 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 13:18:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][0/625] eta 0:08:49 lr 0.000246 wd 0.0500 time 0.8467 (0.8467) data time 0.3225 (0.3225) model time 0.0000 (0.0000) loss 6.5855 (6.5855) grad_norm 2.3366 (2.3366) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:18:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][10/625] eta 0:06:07 lr 0.000246 wd 0.0500 time 0.5673 (0.5979) data time 0.0007 (0.0300) model time 0.0000 (0.0000) loss 6.6699 (7.2703) grad_norm 2.4333 (2.4670) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:18:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][20/625] eta 0:05:53 lr 0.000246 wd 0.0500 time 0.5716 (0.5847) data time 0.0007 (0.0161) model time 0.0000 (0.0000) loss 6.2809 (7.1993) grad_norm 1.7839 (2.3086) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:18:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][30/625] eta 0:05:44 lr 0.000246 wd 0.0500 time 0.5671 (0.5794) data time 0.0007 (0.0112) model time 0.0000 (0.0000) loss 7.6021 (7.1514) grad_norm 2.3135 (2.4666) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:18:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][40/625] eta 0:05:37 lr 0.000246 wd 0.0500 time 0.5663 (0.5768) data time 0.0008 (0.0087) model time 0.0000 (0.0000) loss 6.9930 (7.1322) grad_norm 1.9036 (2.5617) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:18:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][50/625] eta 0:05:35 lr 0.000246 wd 0.0500 time 0.5675 (0.5834) data time 0.0007 (0.0071) model time 0.0000 (0.0000) loss 7.1477 (7.1952) grad_norm 2.8373 (2.5269) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:19:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][60/625] eta 0:05:30 lr 0.000246 wd 0.0500 time 0.5649 (0.5843) data time 0.0008 (0.0061) model time 0.5641 (0.5883) loss 6.6260 (7.1593) grad_norm 2.4432 (2.6490) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:19:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][70/625] eta 0:05:23 lr 0.000246 wd 0.0500 time 0.5727 (0.5827) data time 0.0008 (0.0053) model time 0.5719 (0.5803) loss 6.5649 (7.2108) grad_norm 2.2708 (2.6528) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:19:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][80/625] eta 0:05:16 lr 0.000245 wd 0.0500 time 0.5735 (0.5814) data time 0.0008 (0.0048) model time 0.5727 (0.5772) loss 7.1818 (7.2809) grad_norm 3.5371 (2.6190) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:19:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][90/625] eta 0:05:10 lr 0.000245 wd 0.0500 time 0.5695 (0.5804) data time 0.0006 (0.0044) model time 0.5688 (0.5757) loss 8.2298 (7.3083) grad_norm 1.8016 (2.5779) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:19:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][100/625] eta 0:05:04 lr 0.000245 wd 0.0500 time 0.5648 (0.5794) data time 0.0007 (0.0040) model time 0.5641 (0.5746) loss 7.5847 (7.2715) grad_norm 2.6888 (2.5810) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:19:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][110/625] eta 0:04:58 lr 0.000245 wd 0.0500 time 0.5662 (0.5788) data time 0.0006 (0.0037) model time 0.5656 (0.5740) loss 7.5409 (7.2659) grad_norm 2.8060 (2.6227) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:19:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][120/625] eta 0:04:51 lr 0.000245 wd 0.0500 time 0.5700 (0.5782) data time 0.0010 (0.0035) model time 0.5691 (0.5736) loss 8.7363 (7.2576) grad_norm 2.8272 (2.6465) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:19:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][130/625] eta 0:04:45 lr 0.000245 wd 0.0500 time 0.5681 (0.5775) data time 0.0006 (0.0033) model time 0.5675 (0.5730) loss 6.5264 (7.2411) grad_norm 1.8811 (2.6055) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:19:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][140/625] eta 0:04:39 lr 0.000245 wd 0.0500 time 0.5665 (0.5771) data time 0.0008 (0.0031) model time 0.5657 (0.5727) loss 6.4784 (7.2340) grad_norm 2.2779 (2.5982) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:19:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][150/625] eta 0:04:33 lr 0.000245 wd 0.0500 time 0.5704 (0.5765) data time 0.0006 (0.0029) model time 0.5698 (0.5723) loss 7.6532 (7.2120) grad_norm 2.4703 (2.5798) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:19:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][160/625] eta 0:04:27 lr 0.000245 wd 0.0500 time 0.5645 (0.5763) data time 0.0009 (0.0028) model time 0.5637 (0.5722) loss 6.2717 (7.1991) grad_norm 1.7876 (2.5731) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:20:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][170/625] eta 0:04:22 lr 0.000245 wd 0.0500 time 0.5656 (0.5759) data time 0.0008 (0.0027) model time 0.5648 (0.5720) loss 7.1521 (7.1954) grad_norm 1.8393 (2.5568) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:20:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][180/625] eta 0:04:16 lr 0.000245 wd 0.0500 time 0.5712 (0.5759) data time 0.0006 (0.0026) model time 0.5706 (0.5722) loss 7.4324 (7.1934) grad_norm 2.6042 (2.5456) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:20:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][190/625] eta 0:04:10 lr 0.000245 wd 0.0500 time 0.5671 (0.5755) data time 0.0007 (0.0025) model time 0.5664 (0.5719) loss 7.5049 (7.1979) grad_norm 2.5513 (2.5260) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:20:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][200/625] eta 0:04:04 lr 0.000244 wd 0.0500 time 0.5666 (0.5753) data time 0.0006 (0.0024) model time 0.5660 (0.5717) loss 7.7516 (7.2164) grad_norm 1.9468 (2.5192) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:20:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][210/625] eta 0:03:58 lr 0.000244 wd 0.0500 time 0.5689 (0.5750) data time 0.0008 (0.0023) model time 0.5680 (0.5716) loss 7.7932 (7.2249) grad_norm 1.8182 (2.5043) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:20:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][220/625] eta 0:03:52 lr 0.000244 wd 0.0500 time 0.5665 (0.5748) data time 0.0008 (0.0023) model time 0.5657 (0.5714) loss 5.8535 (7.2109) grad_norm 2.4303 (2.4932) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:20:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][230/625] eta 0:03:47 lr 0.000244 wd 0.0500 time 0.5706 (0.5751) data time 0.0008 (0.0022) model time 0.5699 (0.5719) loss 9.0944 (7.2187) grad_norm 2.0483 (2.5449) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:20:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][240/625] eta 0:03:41 lr 0.000244 wd 0.0500 time 0.5701 (0.5749) data time 0.0008 (0.0021) model time 0.5693 (0.5718) loss 7.2429 (7.2304) grad_norm 1.9260 (2.5329) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:20:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][250/625] eta 0:03:35 lr 0.000244 wd 0.0500 time 0.5664 (0.5747) data time 0.0008 (0.0021) model time 0.5656 (0.5717) loss 6.3455 (7.2355) grad_norm 2.1740 (2.5289) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:20:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][260/625] eta 0:03:29 lr 0.000244 wd 0.0500 time 0.5703 (0.5745) data time 0.0006 (0.0020) model time 0.5697 (0.5715) loss 6.4210 (7.2185) grad_norm 2.0041 (2.5240) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:21:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][270/625] eta 0:03:24 lr 0.000244 wd 0.0500 time 0.5729 (0.5756) data time 0.0006 (0.0020) model time 0.5723 (0.5730) loss 8.3899 (7.2352) grad_norm 2.0846 (2.5076) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:21:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][280/625] eta 0:03:18 lr 0.000244 wd 0.0500 time 0.5691 (0.5768) data time 0.0008 (0.0020) model time 0.5683 (0.5745) loss 7.8968 (7.2447) grad_norm 2.8585 (2.5115) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:21:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][290/625] eta 0:03:13 lr 0.000244 wd 0.0500 time 0.5697 (0.5766) data time 0.0006 (0.0019) model time 0.5691 (0.5743) loss 6.2037 (7.2408) grad_norm 2.1950 (2.5096) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:21:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][300/625] eta 0:03:07 lr 0.000244 wd 0.0500 time 0.5704 (0.5764) data time 0.0007 (0.0019) model time 0.5696 (0.5742) loss 8.5174 (7.2307) grad_norm 2.2285 (2.5238) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:21:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][310/625] eta 0:03:01 lr 0.000244 wd 0.0500 time 0.5681 (0.5762) data time 0.0006 (0.0018) model time 0.5674 (0.5740) loss 7.7112 (7.2348) grad_norm 2.8141 (2.5229) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:21:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][320/625] eta 0:02:55 lr 0.000243 wd 0.0500 time 0.5693 (0.5760) data time 0.0008 (0.0018) model time 0.5685 (0.5738) loss 8.3134 (7.2375) grad_norm 2.0707 (2.5313) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:21:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][330/625] eta 0:02:49 lr 0.000243 wd 0.0500 time 0.5684 (0.5758) data time 0.0006 (0.0018) model time 0.5678 (0.5736) loss 6.2915 (7.2326) grad_norm 2.8100 (2.5271) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:21:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][340/625] eta 0:02:44 lr 0.000243 wd 0.0500 time 0.5687 (0.5756) data time 0.0006 (0.0017) model time 0.5681 (0.5735) loss 7.3665 (7.2408) grad_norm 2.0397 (2.5249) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:21:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][350/625] eta 0:02:38 lr 0.000243 wd 0.0500 time 0.5689 (0.5755) data time 0.0008 (0.0017) model time 0.5682 (0.5734) loss 7.5421 (7.2456) grad_norm 2.9189 (2.5328) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:21:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][360/625] eta 0:02:32 lr 0.000243 wd 0.0500 time 0.5835 (0.5754) data time 0.0006 (0.0017) model time 0.5828 (0.5733) loss 8.2184 (7.2419) grad_norm 2.4595 (2.5310) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:21:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][370/625] eta 0:02:26 lr 0.000243 wd 0.0500 time 0.5692 (0.5753) data time 0.0008 (0.0017) model time 0.5684 (0.5732) loss 7.9464 (7.2405) grad_norm 1.8145 (2.5189) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:22:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][380/625] eta 0:02:20 lr 0.000243 wd 0.0500 time 0.5673 (0.5751) data time 0.0008 (0.0016) model time 0.5665 (0.5731) loss 7.2570 (7.2366) grad_norm 2.6440 (2.5077) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:22:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][390/625] eta 0:02:15 lr 0.000243 wd 0.0500 time 0.5695 (0.5751) data time 0.0006 (0.0016) model time 0.5689 (0.5730) loss 6.9390 (7.2365) grad_norm 1.9261 (2.4971) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:22:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][400/625] eta 0:02:09 lr 0.000243 wd 0.0500 time 0.5687 (0.5749) data time 0.0006 (0.0016) model time 0.5681 (0.5729) loss 7.7941 (7.2406) grad_norm 2.6804 (2.4865) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:22:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][410/625] eta 0:02:03 lr 0.000243 wd 0.0500 time 0.5671 (0.5748) data time 0.0006 (0.0016) model time 0.5665 (0.5728) loss 6.4503 (7.2344) grad_norm 2.5170 (2.4764) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:22:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][420/625] eta 0:01:57 lr 0.000243 wd 0.0500 time 0.5693 (0.5747) data time 0.0008 (0.0016) model time 0.5685 (0.5727) loss 7.7941 (7.2344) grad_norm 2.4183 (2.4744) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:22:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][430/625] eta 0:01:52 lr 0.000243 wd 0.0500 time 0.5689 (0.5746) data time 0.0008 (0.0016) model time 0.5680 (0.5726) loss 7.7853 (7.2405) grad_norm 2.0430 (2.4682) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:22:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][440/625] eta 0:01:46 lr 0.000242 wd 0.0500 time 0.5676 (0.5745) data time 0.0007 (0.0015) model time 0.5669 (0.5726) loss 6.9819 (7.2458) grad_norm 2.2891 (2.4683) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:22:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][450/625] eta 0:01:40 lr 0.000242 wd 0.0500 time 0.5626 (0.5747) data time 0.0007 (0.0015) model time 0.5619 (0.5728) loss 6.6136 (7.2389) grad_norm 2.8454 (2.4708) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:22:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][460/625] eta 0:01:34 lr 0.000242 wd 0.0500 time 0.5717 (0.5746) data time 0.0008 (0.0015) model time 0.5709 (0.5727) loss 7.2140 (7.2331) grad_norm 2.1925 (2.4691) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:22:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][470/625] eta 0:01:29 lr 0.000242 wd 0.0500 time 0.5657 (0.5745) data time 0.0009 (0.0015) model time 0.5648 (0.5726) loss 6.1116 (7.2372) grad_norm 1.7897 (2.4762) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:23:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][480/625] eta 0:01:23 lr 0.000242 wd 0.0500 time 0.5646 (0.5744) data time 0.0008 (0.0015) model time 0.5638 (0.5725) loss 6.7259 (7.2425) grad_norm 3.4440 (2.4765) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:23:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][490/625] eta 0:01:17 lr 0.000242 wd 0.0500 time 0.5638 (0.5748) data time 0.0008 (0.0015) model time 0.5630 (0.5730) loss 6.7959 (7.2403) grad_norm 1.7729 (2.4780) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:23:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][500/625] eta 0:01:11 lr 0.000242 wd 0.0500 time 0.5681 (0.5757) data time 0.0008 (0.0014) model time 0.5673 (0.5740) loss 8.2042 (7.2564) grad_norm 2.8405 (2.4827) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:23:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][510/625] eta 0:01:06 lr 0.000242 wd 0.0500 time 0.5692 (0.5756) data time 0.0006 (0.0014) model time 0.5686 (0.5740) loss 6.7806 (7.2593) grad_norm 2.1155 (2.4787) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:23:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][520/625] eta 0:01:00 lr 0.000242 wd 0.0500 time 0.5700 (0.5755) data time 0.0008 (0.0014) model time 0.5692 (0.5739) loss 7.3422 (7.2551) grad_norm 2.0370 (2.4720) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:23:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][530/625] eta 0:00:54 lr 0.000242 wd 0.0500 time 0.5684 (0.5754) data time 0.0006 (0.0014) model time 0.5678 (0.5738) loss 6.8174 (7.2548) grad_norm 1.5566 (2.4712) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:23:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][540/625] eta 0:00:48 lr 0.000242 wd 0.0500 time 0.5702 (0.5753) data time 0.0006 (0.0014) model time 0.5696 (0.5737) loss 6.0233 (7.2565) grad_norm 2.5117 (2.4629) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:23:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][550/625] eta 0:00:43 lr 0.000242 wd 0.0500 time 0.5655 (0.5752) data time 0.0008 (0.0014) model time 0.5646 (0.5736) loss 7.0192 (7.2548) grad_norm 2.1112 (2.4574) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:23:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][560/625] eta 0:00:37 lr 0.000241 wd 0.0500 time 0.5637 (0.5752) data time 0.0008 (0.0014) model time 0.5629 (0.5735) loss 7.7109 (7.2596) grad_norm 3.5405 (2.4548) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:23:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][570/625] eta 0:00:31 lr 0.000241 wd 0.0500 time 0.5683 (0.5751) data time 0.0008 (0.0014) model time 0.5676 (0.5734) loss 7.8833 (7.2590) grad_norm 2.6435 (2.4564) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:23:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][580/625] eta 0:00:25 lr 0.000241 wd 0.0500 time 0.5677 (0.5751) data time 0.0008 (0.0014) model time 0.5669 (0.5735) loss 6.2085 (7.2605) grad_norm 2.4351 (2.4490) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:24:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][590/625] eta 0:00:20 lr 0.000241 wd 0.0500 time 0.5610 (0.5750) data time 0.0007 (0.0014) model time 0.5603 (0.5734) loss 8.5252 (7.2641) grad_norm 1.8717 (2.4435) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:24:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][600/625] eta 0:00:14 lr 0.000241 wd 0.0500 time 0.5712 (0.5750) data time 0.0006 (0.0013) model time 0.5706 (0.5734) loss 8.7248 (7.2719) grad_norm 2.4858 (2.4381) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:24:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][610/625] eta 0:00:08 lr 0.000241 wd 0.0500 time 0.5656 (0.5749) data time 0.0006 (0.0013) model time 0.5650 (0.5733) loss 7.8822 (7.2725) grad_norm 3.2310 (2.4385) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:24:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [218/300][620/625] eta 0:00:02 lr 0.000241 wd 0.0500 time 0.5627 (0.5748) data time 0.0004 (0.0013) model time 0.5623 (0.5732) loss 8.2339 (7.2770) grad_norm 2.3396 (2.4608) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:24:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 218 training takes 0:05:59 +[2024-07-25 13:24:24 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 13:24:26 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 13:24:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.478 (0.478) Loss 0.5083 (0.5083) Acc@1 90.039 (90.039) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 13:24:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.158) Loss 0.7642 (0.6175) Acc@1 82.373 (87.509) Acc@5 97.021 (98.025) Mem 22339MB +[2024-07-25 13:24:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8667 (0.7113) Acc@1 79.785 (84.773) Acc@5 96.094 (97.145) Mem 22339MB +[2024-07-25 13:24:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.349 Acc@5 97.133 +[2024-07-25 13:24:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.3% +[2024-07-25 13:24:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.876 (0.876) Loss 0.5054 (0.5054) Acc@1 90.479 (90.479) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 13:24:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.194) Loss 0.7480 (0.6195) Acc@1 83.350 (87.633) Acc@5 96.777 (98.029) Mem 22339MB +[2024-07-25 13:24:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.161) Loss 0.8613 (0.7085) Acc@1 79.102 (84.787) Acc@5 96.094 (97.203) Mem 22339MB +[2024-07-25 13:24:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.419 Acc@5 97.187 +[2024-07-25 13:24:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.4% +[2024-07-25 13:24:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.42% +[2024-07-25 13:24:33 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 13:24:35 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 13:24:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][0/625] eta 0:10:32 lr 0.000241 wd 0.0500 time 1.0119 (1.0119) data time 0.4937 (0.4937) model time 0.0000 (0.0000) loss 6.7874 (6.7874) grad_norm 1.9935 (1.9935) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:24:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][10/625] eta 0:06:16 lr 0.000241 wd 0.0500 time 0.5718 (0.6118) data time 0.0007 (0.0456) model time 0.0000 (0.0000) loss 6.5186 (7.0329) grad_norm 2.1630 (2.6141) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:24:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][20/625] eta 0:05:57 lr 0.000241 wd 0.0500 time 0.5666 (0.5916) data time 0.0006 (0.0243) model time 0.0000 (0.0000) loss 7.3621 (7.0199) grad_norm 2.3367 (2.5057) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:24:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][30/625] eta 0:05:48 lr 0.000241 wd 0.0500 time 0.5714 (0.5861) data time 0.0008 (0.0167) model time 0.0000 (0.0000) loss 8.2194 (7.1462) grad_norm 2.9518 (2.3452) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:24:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][40/625] eta 0:05:40 lr 0.000241 wd 0.0500 time 0.5635 (0.5820) data time 0.0006 (0.0128) model time 0.0000 (0.0000) loss 6.8964 (7.2967) grad_norm 2.6615 (2.3455) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:25:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][50/625] eta 0:05:34 lr 0.000240 wd 0.0500 time 0.5705 (0.5809) data time 0.0008 (0.0105) model time 0.0000 (0.0000) loss 6.5113 (7.2998) grad_norm 2.4021 (2.4617) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:25:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][60/625] eta 0:05:27 lr 0.000240 wd 0.0500 time 0.5630 (0.5793) data time 0.0008 (0.0089) model time 0.5623 (0.5701) loss 8.0630 (7.3251) grad_norm 2.3112 (2.4969) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:25:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][70/625] eta 0:05:20 lr 0.000240 wd 0.0500 time 0.5690 (0.5784) data time 0.0006 (0.0078) model time 0.5683 (0.5711) loss 7.2967 (7.3485) grad_norm 1.7639 (2.4364) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:25:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][80/625] eta 0:05:15 lr 0.000240 wd 0.0500 time 0.7130 (0.5792) data time 0.0006 (0.0069) model time 0.7124 (0.5755) loss 7.0835 (7.3164) grad_norm 2.0322 (2.3924) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:25:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][90/625] eta 0:05:13 lr 0.000240 wd 0.0500 time 0.5652 (0.5853) data time 0.0009 (0.0062) model time 0.5643 (0.5902) loss 8.2225 (7.3278) grad_norm 4.5494 (2.4524) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:25:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][100/625] eta 0:05:07 lr 0.000240 wd 0.0500 time 0.5644 (0.5861) data time 0.0006 (0.0057) model time 0.5638 (0.5905) loss 7.1480 (7.3589) grad_norm 2.0256 (2.4955) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:25:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][110/625] eta 0:05:01 lr 0.000240 wd 0.0500 time 0.5692 (0.5847) data time 0.0007 (0.0053) model time 0.5685 (0.5871) loss 6.9658 (7.3638) grad_norm 3.1553 (2.5320) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:25:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][120/625] eta 0:04:54 lr 0.000240 wd 0.0500 time 0.5626 (0.5838) data time 0.0008 (0.0049) model time 0.5619 (0.5851) loss 8.2033 (7.3727) grad_norm 4.2508 (2.5610) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:25:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][130/625] eta 0:04:48 lr 0.000240 wd 0.0500 time 0.5686 (0.5827) data time 0.0006 (0.0046) model time 0.5680 (0.5831) loss 7.5176 (7.3746) grad_norm 1.8020 (2.5626) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:25:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][140/625] eta 0:04:42 lr 0.000240 wd 0.0500 time 0.5645 (0.5822) data time 0.0007 (0.0043) model time 0.5639 (0.5822) loss 7.3994 (7.3668) grad_norm 2.1741 (2.6334) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:26:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][150/625] eta 0:04:36 lr 0.000240 wd 0.0500 time 0.5694 (0.5815) data time 0.0006 (0.0041) model time 0.5688 (0.5809) loss 6.9254 (7.3374) grad_norm 15.6328 (2.6930) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:26:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][160/625] eta 0:04:30 lr 0.000240 wd 0.0500 time 0.5694 (0.5809) data time 0.0008 (0.0039) model time 0.5686 (0.5800) loss 7.9023 (7.3282) grad_norm 1.9107 (2.6734) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:26:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][170/625] eta 0:04:24 lr 0.000239 wd 0.0500 time 0.5674 (0.5806) data time 0.0007 (0.0037) model time 0.5667 (0.5796) loss 6.3842 (7.3190) grad_norm 2.3318 (2.6638) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:26:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][180/625] eta 0:04:18 lr 0.000239 wd 0.0500 time 0.5744 (0.5800) data time 0.0007 (0.0035) model time 0.5737 (0.5789) loss 7.5447 (7.3012) grad_norm 6.7183 (2.6769) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:26:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][190/625] eta 0:04:12 lr 0.000239 wd 0.0500 time 0.5692 (0.5795) data time 0.0008 (0.0034) model time 0.5684 (0.5781) loss 7.3017 (7.3018) grad_norm 2.0372 (2.6589) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:26:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][200/625] eta 0:04:06 lr 0.000239 wd 0.0500 time 0.5679 (0.5791) data time 0.0006 (0.0033) model time 0.5673 (0.5776) loss 6.3981 (7.2857) grad_norm 1.8110 (2.6377) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:26:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][210/625] eta 0:04:00 lr 0.000239 wd 0.0500 time 0.5642 (0.5790) data time 0.0007 (0.0032) model time 0.5636 (0.5775) loss 6.8126 (7.2897) grad_norm 2.2982 (2.6107) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:26:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][220/625] eta 0:03:54 lr 0.000239 wd 0.0500 time 0.5668 (0.5785) data time 0.0008 (0.0030) model time 0.5660 (0.5769) loss 7.6781 (7.2927) grad_norm 2.1695 (2.6019) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:26:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][230/625] eta 0:03:48 lr 0.000239 wd 0.0500 time 0.5691 (0.5781) data time 0.0007 (0.0030) model time 0.5685 (0.5765) loss 6.9098 (7.2735) grad_norm 2.6473 (2.6140) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:26:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][240/625] eta 0:03:42 lr 0.000239 wd 0.0500 time 0.5679 (0.5778) data time 0.0006 (0.0029) model time 0.5673 (0.5762) loss 7.9445 (7.2684) grad_norm 2.1425 (2.6117) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:27:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][250/625] eta 0:03:36 lr 0.000239 wd 0.0500 time 0.5706 (0.5776) data time 0.0008 (0.0028) model time 0.5698 (0.5759) loss 7.3331 (7.2594) grad_norm 1.6931 (2.5937) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:27:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][260/625] eta 0:03:30 lr 0.000239 wd 0.0500 time 0.5687 (0.5773) data time 0.0006 (0.0027) model time 0.5681 (0.5756) loss 6.7868 (7.2415) grad_norm 2.9797 (2.5809) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:27:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][270/625] eta 0:03:24 lr 0.000239 wd 0.0500 time 0.5702 (0.5770) data time 0.0008 (0.0026) model time 0.5695 (0.5753) loss 5.9697 (7.2306) grad_norm 2.2508 (2.5659) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:27:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][280/625] eta 0:03:19 lr 0.000239 wd 0.0500 time 0.5691 (0.5769) data time 0.0008 (0.0026) model time 0.5683 (0.5752) loss 7.3645 (7.2271) grad_norm 2.2581 (2.5526) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:27:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][290/625] eta 0:03:13 lr 0.000238 wd 0.0500 time 0.5701 (0.5767) data time 0.0008 (0.0025) model time 0.5693 (0.5749) loss 7.9143 (7.2309) grad_norm 2.0236 (2.5468) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:27:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][300/625] eta 0:03:07 lr 0.000238 wd 0.0500 time 0.7505 (0.5771) data time 0.0006 (0.0025) model time 0.7499 (0.5755) loss 6.6669 (7.2217) grad_norm 3.4658 (2.5683) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:27:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][310/625] eta 0:03:02 lr 0.000238 wd 0.0500 time 0.7293 (0.5796) data time 0.0006 (0.0024) model time 0.7287 (0.5785) loss 7.8514 (7.2196) grad_norm 1.8678 (2.6122) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:27:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][320/625] eta 0:02:56 lr 0.000238 wd 0.0500 time 0.5680 (0.5800) data time 0.0008 (0.0024) model time 0.5673 (0.5790) loss 7.6190 (7.2207) grad_norm 2.7948 (2.6245) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:27:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][330/625] eta 0:02:50 lr 0.000238 wd 0.0500 time 0.5637 (0.5796) data time 0.0008 (0.0023) model time 0.5628 (0.5786) loss 7.8100 (7.2237) grad_norm 2.5360 (2.6116) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:27:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][340/625] eta 0:02:45 lr 0.000238 wd 0.0500 time 0.5692 (0.5794) data time 0.0008 (0.0023) model time 0.5683 (0.5783) loss 6.4861 (7.2352) grad_norm 2.5477 (2.6086) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:27:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][350/625] eta 0:02:39 lr 0.000238 wd 0.0500 time 0.5679 (0.5791) data time 0.0008 (0.0022) model time 0.5670 (0.5780) loss 5.3638 (7.2330) grad_norm 2.9549 (2.6073) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:28:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][360/625] eta 0:02:33 lr 0.000238 wd 0.0500 time 0.5684 (0.5788) data time 0.0008 (0.0022) model time 0.5676 (0.5777) loss 7.2711 (7.2376) grad_norm 3.9521 (2.6057) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:28:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][370/625] eta 0:02:27 lr 0.000238 wd 0.0500 time 0.5647 (0.5788) data time 0.0007 (0.0022) model time 0.5639 (0.5776) loss 5.8779 (7.2208) grad_norm 2.8334 (2.6523) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:28:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][380/625] eta 0:02:21 lr 0.000238 wd 0.0500 time 0.5693 (0.5788) data time 0.0007 (0.0021) model time 0.5687 (0.5777) loss 7.9045 (7.2343) grad_norm 1.8716 (2.6484) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:28:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][390/625] eta 0:02:16 lr 0.000238 wd 0.0500 time 0.5695 (0.5788) data time 0.0006 (0.0021) model time 0.5689 (0.5776) loss 7.4684 (7.2449) grad_norm 1.7740 (2.6407) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:28:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][400/625] eta 0:02:10 lr 0.000238 wd 0.0500 time 0.5672 (0.5786) data time 0.0008 (0.0021) model time 0.5664 (0.5774) loss 7.5617 (7.2418) grad_norm 2.1449 (2.6341) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:28:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][410/625] eta 0:02:04 lr 0.000237 wd 0.0500 time 0.5687 (0.5784) data time 0.0007 (0.0020) model time 0.5680 (0.5773) loss 7.1321 (7.2466) grad_norm 2.6299 (2.6497) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:28:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][420/625] eta 0:01:58 lr 0.000237 wd 0.0500 time 0.5693 (0.5783) data time 0.0009 (0.0020) model time 0.5684 (0.5772) loss 8.3390 (7.2570) grad_norm 2.4096 (2.6598) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:28:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][430/625] eta 0:01:52 lr 0.000237 wd 0.0500 time 0.5675 (0.5787) data time 0.0007 (0.0020) model time 0.5668 (0.5776) loss 7.0128 (7.2515) grad_norm 1.9932 (2.6637) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:28:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][440/625] eta 0:01:47 lr 0.000237 wd 0.0500 time 0.5740 (0.5786) data time 0.0007 (0.0019) model time 0.5733 (0.5774) loss 7.0730 (7.2386) grad_norm 2.1060 (2.6918) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:28:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][450/625] eta 0:01:41 lr 0.000237 wd 0.0500 time 0.5689 (0.5784) data time 0.0006 (0.0019) model time 0.5683 (0.5772) loss 5.8882 (7.2349) grad_norm 2.7517 (2.6956) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:29:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][460/625] eta 0:01:35 lr 0.000237 wd 0.0500 time 0.5665 (0.5783) data time 0.0009 (0.0019) model time 0.5656 (0.5771) loss 7.6725 (7.2348) grad_norm 3.1479 (2.6908) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:29:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][470/625] eta 0:01:29 lr 0.000237 wd 0.0500 time 0.5646 (0.5781) data time 0.0006 (0.0019) model time 0.5640 (0.5769) loss 7.8219 (7.2377) grad_norm 4.0974 (2.6883) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:29:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][480/625] eta 0:01:23 lr 0.000237 wd 0.0500 time 0.5687 (0.5779) data time 0.0008 (0.0019) model time 0.5679 (0.5767) loss 6.1536 (7.2285) grad_norm 2.4611 (2.6891) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:29:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][490/625] eta 0:01:17 lr 0.000237 wd 0.0500 time 0.5646 (0.5778) data time 0.0006 (0.0018) model time 0.5640 (0.5766) loss 8.1815 (7.2302) grad_norm 2.9941 (2.6875) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:29:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][500/625] eta 0:01:12 lr 0.000237 wd 0.0500 time 0.5641 (0.5776) data time 0.0007 (0.0018) model time 0.5633 (0.5764) loss 7.5841 (7.2254) grad_norm 2.5041 (2.6827) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:29:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][510/625] eta 0:01:06 lr 0.000237 wd 0.0500 time 0.5612 (0.5775) data time 0.0008 (0.0018) model time 0.5603 (0.5763) loss 6.7744 (7.2207) grad_norm 2.3384 (2.6731) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:29:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][520/625] eta 0:01:00 lr 0.000237 wd 0.0500 time 0.5730 (0.5777) data time 0.0007 (0.0018) model time 0.5723 (0.5765) loss 6.5226 (7.2240) grad_norm 2.8264 (2.6678) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:29:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][530/625] eta 0:00:54 lr 0.000236 wd 0.0500 time 0.5675 (0.5783) data time 0.0006 (0.0018) model time 0.5669 (0.5772) loss 7.1109 (7.2227) grad_norm 2.6427 (2.6580) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:29:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][540/625] eta 0:00:49 lr 0.000236 wd 0.0500 time 0.5699 (0.5783) data time 0.0007 (0.0017) model time 0.5692 (0.5772) loss 7.9032 (7.2268) grad_norm 2.2262 (2.6508) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:29:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][550/625] eta 0:00:43 lr 0.000236 wd 0.0500 time 0.5684 (0.5782) data time 0.0007 (0.0017) model time 0.5677 (0.5771) loss 6.8416 (7.2281) grad_norm 2.0414 (2.6460) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:29:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][560/625] eta 0:00:37 lr 0.000236 wd 0.0500 time 0.5680 (0.5781) data time 0.0009 (0.0017) model time 0.5672 (0.5769) loss 8.6039 (7.2267) grad_norm 2.2058 (2.6389) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:30:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][570/625] eta 0:00:31 lr 0.000236 wd 0.0500 time 0.5645 (0.5779) data time 0.0008 (0.0017) model time 0.5637 (0.5768) loss 7.2859 (7.2347) grad_norm 2.0482 (2.6435) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:30:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][580/625] eta 0:00:26 lr 0.000236 wd 0.0500 time 0.5711 (0.5778) data time 0.0008 (0.0017) model time 0.5704 (0.5767) loss 5.9802 (7.2276) grad_norm 3.7685 (2.6487) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:30:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][590/625] eta 0:00:20 lr 0.000236 wd 0.0500 time 0.5708 (0.5777) data time 0.0008 (0.0017) model time 0.5700 (0.5766) loss 6.6359 (7.2255) grad_norm 3.0507 (2.6640) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:30:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][600/625] eta 0:00:14 lr 0.000236 wd 0.0500 time 0.5687 (0.5776) data time 0.0006 (0.0017) model time 0.5681 (0.5765) loss 8.2078 (7.2267) grad_norm 3.1517 (2.6733) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:30:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][610/625] eta 0:00:08 lr 0.000236 wd 0.0500 time 0.5673 (0.5775) data time 0.0004 (0.0016) model time 0.5669 (0.5763) loss 7.2182 (7.2269) grad_norm 2.2328 (2.6792) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:30:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [219/300][620/625] eta 0:00:02 lr 0.000236 wd 0.0500 time 0.5687 (0.5774) data time 0.0004 (0.0016) model time 0.5683 (0.5762) loss 7.2307 (7.2281) grad_norm 2.7442 (2.6776) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:30:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 219 training takes 0:06:00 +[2024-07-25 13:30:36 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 13:30:37 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 13:30:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.461 (0.461) Loss 0.5049 (0.5049) Acc@1 89.600 (89.600) Acc@5 98.828 (98.828) Mem 22339MB +[2024-07-25 13:30:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.156) Loss 0.7627 (0.6165) Acc@1 82.373 (87.456) Acc@5 96.924 (98.002) Mem 22339MB +[2024-07-25 13:30:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.8467 (0.7071) Acc@1 79.834 (84.675) Acc@5 96.094 (97.161) Mem 22339MB +[2024-07-25 13:30:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.343 Acc@5 97.161 +[2024-07-25 13:30:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.3% +[2024-07-25 13:30:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.809 (0.809) Loss 0.5059 (0.5059) Acc@1 90.527 (90.527) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 13:30:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.188) Loss 0.7476 (0.6193) Acc@1 83.350 (87.660) Acc@5 96.875 (98.047) Mem 22339MB +[2024-07-25 13:30:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.158) Loss 0.8604 (0.7082) Acc@1 79.199 (84.810) Acc@5 96.143 (97.214) Mem 22339MB +[2024-07-25 13:30:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.439 Acc@5 97.197 +[2024-07-25 13:30:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.4% +[2024-07-25 13:30:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.44% +[2024-07-25 13:30:45 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 13:30:47 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 13:30:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][0/625] eta 0:10:02 lr 0.000236 wd 0.0500 time 0.9639 (0.9639) data time 0.4460 (0.4460) model time 0.0000 (0.0000) loss 7.2101 (7.2101) grad_norm 1.8053 (1.8053) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:30:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][10/625] eta 0:06:11 lr 0.000236 wd 0.0500 time 0.5721 (0.6048) data time 0.0008 (0.0413) model time 0.0000 (0.0000) loss 7.7545 (7.0459) grad_norm 2.3764 (2.3169) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:30:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][20/625] eta 0:05:55 lr 0.000235 wd 0.0500 time 0.5706 (0.5882) data time 0.0008 (0.0220) model time 0.0000 (0.0000) loss 7.1327 (7.4017) grad_norm 2.1367 (2.6112) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:31:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][30/625] eta 0:05:46 lr 0.000235 wd 0.0500 time 0.5725 (0.5828) data time 0.0006 (0.0151) model time 0.0000 (0.0000) loss 9.0549 (7.3191) grad_norm 3.0077 (2.6597) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:31:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][40/625] eta 0:05:39 lr 0.000235 wd 0.0500 time 0.5680 (0.5798) data time 0.0006 (0.0117) model time 0.0000 (0.0000) loss 8.3442 (7.3174) grad_norm 2.0178 (2.5731) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:31:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][50/625] eta 0:05:32 lr 0.000235 wd 0.0500 time 0.5641 (0.5777) data time 0.0008 (0.0095) model time 0.0000 (0.0000) loss 7.0481 (7.2896) grad_norm 1.8917 (2.6186) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:31:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][60/625] eta 0:05:25 lr 0.000235 wd 0.0500 time 0.5650 (0.5764) data time 0.0006 (0.0081) model time 0.5644 (0.5687) loss 6.0178 (7.2769) grad_norm 2.2909 (2.6376) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:31:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][70/625] eta 0:05:19 lr 0.000235 wd 0.0500 time 0.5659 (0.5752) data time 0.0009 (0.0071) model time 0.5650 (0.5682) loss 6.6359 (7.2877) grad_norm 2.1842 (2.8201) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:31:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][80/625] eta 0:05:13 lr 0.000235 wd 0.0500 time 0.5655 (0.5744) data time 0.0008 (0.0063) model time 0.5647 (0.5680) loss 7.2257 (7.3233) grad_norm 2.3117 (2.8265) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:31:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][90/625] eta 0:05:07 lr 0.000235 wd 0.0500 time 0.5653 (0.5738) data time 0.0006 (0.0057) model time 0.5647 (0.5681) loss 5.6649 (7.3171) grad_norm 2.1584 (2.7902) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:31:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][100/625] eta 0:05:01 lr 0.000235 wd 0.0500 time 0.5670 (0.5736) data time 0.0007 (0.0052) model time 0.5663 (0.5686) loss 8.0007 (7.2907) grad_norm 2.7896 (2.8561) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:31:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][110/625] eta 0:04:55 lr 0.000235 wd 0.0500 time 0.5695 (0.5733) data time 0.0007 (0.0048) model time 0.5688 (0.5688) loss 7.1111 (7.2655) grad_norm 1.8400 (2.8027) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:31:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][120/625] eta 0:04:51 lr 0.000235 wd 0.0500 time 0.7595 (0.5773) data time 0.0006 (0.0045) model time 0.7589 (0.5761) loss 6.6317 (7.2840) grad_norm 3.4654 (2.7771) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:32:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][130/625] eta 0:04:46 lr 0.000235 wd 0.0500 time 0.5682 (0.5786) data time 0.0006 (0.0042) model time 0.5676 (0.5783) loss 6.0257 (7.3027) grad_norm 1.8714 (2.7356) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:32:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][140/625] eta 0:04:40 lr 0.000234 wd 0.0500 time 0.5664 (0.5780) data time 0.0007 (0.0040) model time 0.5657 (0.5773) loss 8.1602 (7.2780) grad_norm 2.9842 (2.7667) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:32:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][150/625] eta 0:04:34 lr 0.000234 wd 0.0500 time 0.5693 (0.5775) data time 0.0006 (0.0038) model time 0.5687 (0.5766) loss 7.2524 (7.2483) grad_norm 7.6326 (2.7965) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:32:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][160/625] eta 0:04:28 lr 0.000234 wd 0.0500 time 0.5665 (0.5771) data time 0.0006 (0.0036) model time 0.5659 (0.5759) loss 6.0550 (7.2365) grad_norm 2.8388 (2.9894) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:32:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][170/625] eta 0:04:22 lr 0.000234 wd 0.0500 time 0.5711 (0.5767) data time 0.0008 (0.0034) model time 0.5703 (0.5754) loss 8.2970 (7.2214) grad_norm 2.4363 (3.0452) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:32:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][180/625] eta 0:04:16 lr 0.000234 wd 0.0500 time 0.5689 (0.5766) data time 0.0008 (0.0033) model time 0.5680 (0.5753) loss 7.4029 (7.2276) grad_norm 2.9619 (3.0639) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:32:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][190/625] eta 0:04:10 lr 0.000234 wd 0.0500 time 0.5700 (0.5766) data time 0.0007 (0.0032) model time 0.5693 (0.5754) loss 7.5263 (7.2266) grad_norm 2.0861 (3.0465) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:32:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][200/625] eta 0:04:04 lr 0.000234 wd 0.0500 time 0.5686 (0.5763) data time 0.0006 (0.0030) model time 0.5680 (0.5750) loss 5.9258 (7.2405) grad_norm 2.1654 (3.0060) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:32:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][210/625] eta 0:03:59 lr 0.000234 wd 0.0500 time 0.5648 (0.5760) data time 0.0008 (0.0029) model time 0.5639 (0.5747) loss 6.9536 (7.2385) grad_norm 2.1361 (2.9561) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:32:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][220/625] eta 0:03:53 lr 0.000234 wd 0.0500 time 0.5675 (0.5757) data time 0.0006 (0.0028) model time 0.5669 (0.5743) loss 7.1433 (7.2475) grad_norm 2.0995 (2.9333) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:33:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][230/625] eta 0:03:47 lr 0.000234 wd 0.0500 time 0.5676 (0.5755) data time 0.0007 (0.0028) model time 0.5669 (0.5740) loss 7.4671 (7.2508) grad_norm 2.5582 (2.9526) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:33:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][240/625] eta 0:03:41 lr 0.000234 wd 0.0500 time 0.5695 (0.5754) data time 0.0007 (0.0027) model time 0.5688 (0.5740) loss 6.8673 (7.2424) grad_norm 3.9074 (2.9472) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:33:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][250/625] eta 0:03:35 lr 0.000234 wd 0.0500 time 0.5684 (0.5753) data time 0.0006 (0.0026) model time 0.5677 (0.5738) loss 5.7997 (7.2543) grad_norm 2.3052 (2.9523) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:33:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][260/625] eta 0:03:29 lr 0.000233 wd 0.0500 time 0.5674 (0.5750) data time 0.0008 (0.0025) model time 0.5666 (0.5736) loss 6.0186 (7.2531) grad_norm 12.0573 (3.0094) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:33:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][270/625] eta 0:03:24 lr 0.000233 wd 0.0500 time 0.5657 (0.5749) data time 0.0006 (0.0025) model time 0.5650 (0.5734) loss 6.3965 (7.2491) grad_norm 5.4494 (3.0160) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:33:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][280/625] eta 0:03:18 lr 0.000233 wd 0.0500 time 0.5696 (0.5747) data time 0.0008 (0.0024) model time 0.5688 (0.5732) loss 7.7741 (7.2547) grad_norm 3.1284 (3.0204) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:33:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][290/625] eta 0:03:12 lr 0.000233 wd 0.0500 time 0.5708 (0.5746) data time 0.0006 (0.0024) model time 0.5702 (0.5731) loss 7.0939 (7.2507) grad_norm 2.6322 (3.0084) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:33:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][300/625] eta 0:03:06 lr 0.000233 wd 0.0500 time 0.5700 (0.5744) data time 0.0006 (0.0023) model time 0.5694 (0.5730) loss 6.0704 (7.2540) grad_norm 3.0208 (3.0047) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:33:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][310/625] eta 0:03:00 lr 0.000233 wd 0.0500 time 0.5709 (0.5743) data time 0.0008 (0.0023) model time 0.5701 (0.5728) loss 5.8052 (7.2524) grad_norm 3.1108 (2.9914) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:33:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][320/625] eta 0:02:55 lr 0.000233 wd 0.0500 time 0.5701 (0.5742) data time 0.0008 (0.0022) model time 0.5693 (0.5728) loss 7.4293 (7.2483) grad_norm 2.6523 (2.9647) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:33:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][330/625] eta 0:02:49 lr 0.000233 wd 0.0500 time 0.5720 (0.5741) data time 0.0006 (0.0022) model time 0.5713 (0.5727) loss 7.0197 (7.2449) grad_norm 1.9878 (2.9888) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:34:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][340/625] eta 0:02:43 lr 0.000233 wd 0.0500 time 0.5679 (0.5745) data time 0.0007 (0.0021) model time 0.5673 (0.5732) loss 8.6856 (7.2495) grad_norm 1.7144 (2.9907) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:34:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][350/625] eta 0:02:38 lr 0.000233 wd 0.0500 time 0.7263 (0.5767) data time 0.0006 (0.0021) model time 0.7257 (0.5757) loss 5.8501 (7.2469) grad_norm 2.8417 (2.9993) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:34:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][360/625] eta 0:02:32 lr 0.000233 wd 0.0500 time 0.5684 (0.5765) data time 0.0006 (0.0021) model time 0.5678 (0.5754) loss 5.5319 (7.2418) grad_norm 2.1715 (3.0050) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:34:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][370/625] eta 0:02:26 lr 0.000233 wd 0.0500 time 0.5696 (0.5763) data time 0.0007 (0.0020) model time 0.5689 (0.5753) loss 6.3902 (7.2453) grad_norm 2.0255 (2.9816) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:34:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][380/625] eta 0:02:21 lr 0.000232 wd 0.0500 time 0.5701 (0.5761) data time 0.0006 (0.0020) model time 0.5694 (0.5751) loss 7.4550 (7.2439) grad_norm 1.8167 (3.0337) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:34:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][390/625] eta 0:02:15 lr 0.000232 wd 0.0500 time 0.5704 (0.5760) data time 0.0008 (0.0020) model time 0.5696 (0.5750) loss 6.7773 (7.2402) grad_norm 1.9929 (3.0077) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:34:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][400/625] eta 0:02:09 lr 0.000232 wd 0.0500 time 0.5743 (0.5759) data time 0.0008 (0.0019) model time 0.5734 (0.5749) loss 8.3416 (7.2406) grad_norm 3.3021 (3.0202) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:34:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][410/625] eta 0:02:03 lr 0.000232 wd 0.0500 time 0.5670 (0.5760) data time 0.0006 (0.0019) model time 0.5663 (0.5749) loss 7.3204 (7.2397) grad_norm 2.3072 (3.0102) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:34:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][420/625] eta 0:01:58 lr 0.000232 wd 0.0500 time 0.5628 (0.5760) data time 0.0008 (0.0019) model time 0.5620 (0.5749) loss 7.6504 (7.2428) grad_norm 3.7648 (3.0079) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:34:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][430/625] eta 0:01:52 lr 0.000232 wd 0.0500 time 0.5663 (0.5761) data time 0.0008 (0.0018) model time 0.5655 (0.5750) loss 8.4744 (7.2375) grad_norm 2.6258 (2.9978) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:35:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][440/625] eta 0:01:46 lr 0.000232 wd 0.0500 time 0.5697 (0.5760) data time 0.0007 (0.0018) model time 0.5690 (0.5750) loss 6.6462 (7.2327) grad_norm 4.7693 (2.9836) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:35:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][450/625] eta 0:01:40 lr 0.000232 wd 0.0500 time 0.5683 (0.5759) data time 0.0008 (0.0018) model time 0.5676 (0.5748) loss 6.6568 (7.2233) grad_norm 1.8928 (2.9695) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:35:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][460/625] eta 0:01:35 lr 0.000232 wd 0.0500 time 0.5686 (0.5758) data time 0.0007 (0.0018) model time 0.5679 (0.5747) loss 7.2862 (7.2255) grad_norm 1.9714 (2.9591) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:35:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][470/625] eta 0:01:29 lr 0.000232 wd 0.0500 time 0.5708 (0.5757) data time 0.0006 (0.0018) model time 0.5702 (0.5746) loss 7.2480 (7.2288) grad_norm 2.9776 (2.9559) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:35:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][480/625] eta 0:01:23 lr 0.000232 wd 0.0500 time 0.5690 (0.5756) data time 0.0006 (0.0017) model time 0.5684 (0.5746) loss 7.6989 (7.2307) grad_norm 3.1229 (2.9936) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:35:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][490/625] eta 0:01:17 lr 0.000232 wd 0.0500 time 0.5710 (0.5757) data time 0.0008 (0.0017) model time 0.5702 (0.5747) loss 8.2686 (7.2311) grad_norm 2.2231 (2.9915) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:35:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][500/625] eta 0:01:11 lr 0.000231 wd 0.0500 time 0.5700 (0.5756) data time 0.0006 (0.0017) model time 0.5694 (0.5746) loss 7.5973 (7.2289) grad_norm 1.5818 (2.9833) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:35:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][510/625] eta 0:01:06 lr 0.000231 wd 0.0500 time 0.5676 (0.5755) data time 0.0008 (0.0017) model time 0.5668 (0.5745) loss 6.7074 (7.2234) grad_norm 1.8053 (2.9629) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:35:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][520/625] eta 0:01:00 lr 0.000231 wd 0.0500 time 0.5669 (0.5754) data time 0.0006 (0.0017) model time 0.5663 (0.5744) loss 7.5508 (7.2310) grad_norm 1.7476 (2.9566) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:35:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][530/625] eta 0:00:54 lr 0.000231 wd 0.0500 time 0.5692 (0.5754) data time 0.0008 (0.0017) model time 0.5683 (0.5743) loss 7.7687 (7.2324) grad_norm 3.0822 (2.9491) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:35:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][540/625] eta 0:00:48 lr 0.000231 wd 0.0500 time 0.5636 (0.5753) data time 0.0008 (0.0016) model time 0.5628 (0.5742) loss 8.4003 (7.2367) grad_norm 3.2790 (2.9601) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:36:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][550/625] eta 0:00:43 lr 0.000231 wd 0.0500 time 0.5685 (0.5754) data time 0.0007 (0.0016) model time 0.5678 (0.5743) loss 7.5534 (7.2373) grad_norm 2.8203 (2.9752) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:36:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][560/625] eta 0:00:37 lr 0.000231 wd 0.0500 time 0.5668 (0.5757) data time 0.0009 (0.0016) model time 0.5659 (0.5746) loss 8.3828 (7.2393) grad_norm 3.7166 (2.9795) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:36:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][570/625] eta 0:00:31 lr 0.000231 wd 0.0500 time 0.5627 (0.5768) data time 0.0007 (0.0016) model time 0.5620 (0.5759) loss 6.0564 (7.2369) grad_norm 3.8973 (2.9814) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:36:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][580/625] eta 0:00:25 lr 0.000231 wd 0.0500 time 0.5670 (0.5767) data time 0.0008 (0.0016) model time 0.5663 (0.5758) loss 6.8425 (7.2353) grad_norm 2.7253 (2.9795) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:36:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][590/625] eta 0:00:20 lr 0.000231 wd 0.0500 time 0.5654 (0.5766) data time 0.0007 (0.0016) model time 0.5647 (0.5757) loss 5.9387 (7.2427) grad_norm 2.1488 (2.9691) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:36:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][600/625] eta 0:00:14 lr 0.000231 wd 0.0500 time 0.5710 (0.5766) data time 0.0010 (0.0016) model time 0.5700 (0.5756) loss 8.2714 (7.2435) grad_norm 15.0017 (2.9855) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:36:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][610/625] eta 0:00:08 lr 0.000231 wd 0.0500 time 0.5669 (0.5765) data time 0.0006 (0.0016) model time 0.5663 (0.5756) loss 6.9618 (7.2460) grad_norm 3.3284 (2.9800) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:36:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [220/300][620/625] eta 0:00:02 lr 0.000231 wd 0.0500 time 0.5625 (0.5765) data time 0.0006 (0.0016) model time 0.5619 (0.5755) loss 6.1411 (7.2379) grad_norm 2.2545 (2.9896) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:36:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 220 training takes 0:06:00 +[2024-07-25 13:36:47 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 13:36:49 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 13:36:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.488 (0.488) Loss 0.5151 (0.5151) Acc@1 90.283 (90.283) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-25 13:36:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.159) Loss 0.7725 (0.6225) Acc@1 82.080 (87.518) Acc@5 96.777 (98.029) Mem 22339MB +[2024-07-25 13:36:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.143) Loss 0.8647 (0.7152) Acc@1 79.736 (84.766) Acc@5 95.850 (97.135) Mem 22339MB +[2024-07-25 13:36:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.433 Acc@5 97.113 +[2024-07-25 13:36:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.4% +[2024-07-25 13:36:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.798 (0.798) Loss 0.5063 (0.5063) Acc@1 90.527 (90.527) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 13:36:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.187) Loss 0.7471 (0.6190) Acc@1 83.252 (87.673) Acc@5 96.875 (98.047) Mem 22339MB +[2024-07-25 13:36:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.158) Loss 0.8594 (0.7077) Acc@1 79.102 (84.814) Acc@5 96.191 (97.231) Mem 22339MB +[2024-07-25 13:36:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.447 Acc@5 97.213 +[2024-07-25 13:36:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.4% +[2024-07-25 13:36:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.45% +[2024-07-25 13:36:56 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 13:36:58 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 13:36:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][0/625] eta 0:09:27 lr 0.000230 wd 0.0500 time 0.9082 (0.9082) data time 0.3897 (0.3897) model time 0.0000 (0.0000) loss 7.4153 (7.4153) grad_norm 2.6917 (2.6917) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:37:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][10/625] eta 0:06:09 lr 0.000230 wd 0.0500 time 0.5653 (0.6004) data time 0.0008 (0.0362) model time 0.0000 (0.0000) loss 6.1151 (7.1030) grad_norm 2.0178 (2.1933) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:37:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][20/625] eta 0:05:54 lr 0.000230 wd 0.0500 time 0.5666 (0.5860) data time 0.0007 (0.0193) model time 0.0000 (0.0000) loss 7.1592 (7.3866) grad_norm 4.0556 (2.4456) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:37:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][30/625] eta 0:05:45 lr 0.000230 wd 0.0500 time 0.5652 (0.5807) data time 0.0007 (0.0133) model time 0.0000 (0.0000) loss 7.2797 (7.4481) grad_norm 2.5110 (2.5405) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:37:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][40/625] eta 0:05:39 lr 0.000230 wd 0.0500 time 0.5632 (0.5803) data time 0.0007 (0.0103) model time 0.0000 (0.0000) loss 7.5243 (7.4577) grad_norm 3.5110 (2.6755) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:37:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][50/625] eta 0:05:32 lr 0.000230 wd 0.0500 time 0.5652 (0.5790) data time 0.0008 (0.0084) model time 0.0000 (0.0000) loss 6.1517 (7.2852) grad_norm 2.6645 (2.6468) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:37:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][60/625] eta 0:05:26 lr 0.000230 wd 0.0500 time 0.5644 (0.5780) data time 0.0006 (0.0072) model time 0.5637 (0.5721) loss 7.9496 (7.2541) grad_norm 4.1030 (2.6990) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:37:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][70/625] eta 0:05:20 lr 0.000230 wd 0.0500 time 0.5675 (0.5771) data time 0.0006 (0.0063) model time 0.5669 (0.5714) loss 6.1410 (7.2326) grad_norm 2.5699 (2.6438) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:37:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][80/625] eta 0:05:14 lr 0.000230 wd 0.0500 time 0.5634 (0.5765) data time 0.0008 (0.0056) model time 0.5626 (0.5715) loss 6.4587 (7.2866) grad_norm 2.0416 (2.6205) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:37:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][90/625] eta 0:05:08 lr 0.000230 wd 0.0500 time 0.5693 (0.5758) data time 0.0007 (0.0051) model time 0.5686 (0.5710) loss 7.0614 (7.3241) grad_norm 2.5429 (2.5857) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:37:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][100/625] eta 0:05:02 lr 0.000230 wd 0.0500 time 0.5669 (0.5754) data time 0.0007 (0.0047) model time 0.5662 (0.5709) loss 7.4648 (7.3197) grad_norm 2.2640 (2.5675) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:38:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][110/625] eta 0:04:56 lr 0.000230 wd 0.0500 time 0.5693 (0.5750) data time 0.0008 (0.0043) model time 0.5685 (0.5707) loss 6.8397 (7.3037) grad_norm 3.4219 (2.7470) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:38:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][120/625] eta 0:04:50 lr 0.000229 wd 0.0500 time 0.5713 (0.5746) data time 0.0008 (0.0040) model time 0.5705 (0.5707) loss 8.2649 (7.3110) grad_norm 13.5743 (2.8442) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:38:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][130/625] eta 0:04:44 lr 0.000229 wd 0.0500 time 0.5642 (0.5744) data time 0.0006 (0.0038) model time 0.5636 (0.5706) loss 7.8575 (7.3290) grad_norm 2.6645 (2.8575) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:38:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][140/625] eta 0:04:38 lr 0.000229 wd 0.0500 time 0.5707 (0.5741) data time 0.0008 (0.0036) model time 0.5699 (0.5705) loss 7.6279 (7.3312) grad_norm 2.1196 (2.8302) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:38:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][150/625] eta 0:04:33 lr 0.000229 wd 0.0500 time 0.7337 (0.5750) data time 0.0006 (0.0034) model time 0.7331 (0.5721) loss 8.1448 (7.3251) grad_norm 2.4928 (2.8309) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:38:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][160/625] eta 0:04:28 lr 0.000229 wd 0.0500 time 0.6533 (0.5784) data time 0.0008 (0.0032) model time 0.6525 (0.5774) loss 7.3538 (7.3120) grad_norm 2.7319 (2.8627) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:38:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][170/625] eta 0:04:23 lr 0.000229 wd 0.0500 time 0.5660 (0.5797) data time 0.0008 (0.0031) model time 0.5652 (0.5791) loss 7.5483 (7.3217) grad_norm 2.9054 (2.8538) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:38:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][180/625] eta 0:04:18 lr 0.000229 wd 0.0500 time 0.5693 (0.5798) data time 0.0009 (0.0029) model time 0.5684 (0.5793) loss 5.4245 (7.3138) grad_norm 2.5345 (2.8204) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:38:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][190/625] eta 0:04:11 lr 0.000229 wd 0.0500 time 0.5677 (0.5793) data time 0.0006 (0.0028) model time 0.5672 (0.5786) loss 6.9098 (7.2936) grad_norm 3.5434 (2.8055) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:38:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][200/625] eta 0:04:06 lr 0.000229 wd 0.0500 time 0.5680 (0.5789) data time 0.0006 (0.0027) model time 0.5674 (0.5781) loss 6.4197 (7.2847) grad_norm 3.7485 (2.7906) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:39:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][210/625] eta 0:04:00 lr 0.000229 wd 0.0500 time 0.5704 (0.5785) data time 0.0008 (0.0026) model time 0.5696 (0.5776) loss 6.9128 (7.2729) grad_norm 2.8197 (2.7754) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:39:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][220/625] eta 0:03:54 lr 0.000229 wd 0.0500 time 0.5631 (0.5782) data time 0.0008 (0.0026) model time 0.5623 (0.5772) loss 7.9932 (7.2761) grad_norm 2.2467 (2.7580) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:39:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][230/625] eta 0:03:48 lr 0.000229 wd 0.0500 time 0.5694 (0.5779) data time 0.0007 (0.0025) model time 0.5687 (0.5768) loss 7.3727 (7.2718) grad_norm 1.9447 (2.7485) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:39:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][240/625] eta 0:03:42 lr 0.000228 wd 0.0500 time 0.5672 (0.5775) data time 0.0008 (0.0024) model time 0.5664 (0.5764) loss 9.1117 (7.2839) grad_norm 2.1912 (2.7619) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:39:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][250/625] eta 0:03:36 lr 0.000228 wd 0.0500 time 0.5677 (0.5772) data time 0.0008 (0.0023) model time 0.5669 (0.5760) loss 7.6908 (7.2799) grad_norm 3.3872 (2.7822) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:39:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][260/625] eta 0:03:30 lr 0.000228 wd 0.0500 time 0.5702 (0.5770) data time 0.0008 (0.0023) model time 0.5693 (0.5757) loss 6.5176 (7.2662) grad_norm 1.9961 (2.7758) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:39:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][270/625] eta 0:03:24 lr 0.000228 wd 0.0500 time 0.5679 (0.5768) data time 0.0008 (0.0022) model time 0.5671 (0.5755) loss 8.0776 (7.2640) grad_norm 2.1322 (2.9159) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:39:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][280/625] eta 0:03:18 lr 0.000228 wd 0.0500 time 0.5690 (0.5766) data time 0.0008 (0.0022) model time 0.5682 (0.5752) loss 8.2109 (7.2702) grad_norm 1.7466 (2.9118) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:39:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][290/625] eta 0:03:13 lr 0.000228 wd 0.0500 time 0.5704 (0.5764) data time 0.0008 (0.0021) model time 0.5696 (0.5751) loss 6.3714 (7.2618) grad_norm 2.5025 (2.8938) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:39:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][300/625] eta 0:03:07 lr 0.000228 wd 0.0500 time 0.5677 (0.5762) data time 0.0006 (0.0021) model time 0.5672 (0.5748) loss 6.5454 (7.2479) grad_norm 2.0498 (2.8665) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:39:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][310/625] eta 0:03:01 lr 0.000228 wd 0.0500 time 0.5627 (0.5761) data time 0.0008 (0.0020) model time 0.5620 (0.5747) loss 5.9073 (7.2444) grad_norm 1.8818 (2.8545) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:40:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][320/625] eta 0:02:55 lr 0.000228 wd 0.0500 time 0.5683 (0.5759) data time 0.0006 (0.0020) model time 0.5676 (0.5745) loss 7.3536 (7.2549) grad_norm 2.6274 (2.8369) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:40:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][330/625] eta 0:02:49 lr 0.000228 wd 0.0500 time 0.5697 (0.5757) data time 0.0007 (0.0020) model time 0.5690 (0.5743) loss 7.3806 (7.2589) grad_norm 2.2228 (2.8252) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:40:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][340/625] eta 0:02:44 lr 0.000228 wd 0.0500 time 0.5675 (0.5755) data time 0.0008 (0.0019) model time 0.5667 (0.5741) loss 7.3407 (7.2622) grad_norm 1.9386 (2.8088) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:40:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][350/625] eta 0:02:38 lr 0.000228 wd 0.0500 time 0.5692 (0.5754) data time 0.0008 (0.0019) model time 0.5684 (0.5740) loss 7.4843 (7.2646) grad_norm 3.6136 (2.8208) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:40:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][360/625] eta 0:02:32 lr 0.000227 wd 0.0500 time 0.5666 (0.5753) data time 0.0006 (0.0019) model time 0.5660 (0.5739) loss 6.1147 (7.2636) grad_norm 1.6666 (2.8056) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:40:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][370/625] eta 0:02:26 lr 0.000227 wd 0.0500 time 0.7527 (0.5756) data time 0.0009 (0.0018) model time 0.7518 (0.5743) loss 6.8849 (7.2448) grad_norm 4.7323 (2.8049) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:40:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][380/625] eta 0:02:21 lr 0.000227 wd 0.0500 time 0.6491 (0.5771) data time 0.0008 (0.0018) model time 0.6483 (0.5760) loss 8.1915 (7.2471) grad_norm 2.1930 (2.8060) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:40:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][390/625] eta 0:02:15 lr 0.000227 wd 0.0500 time 0.5208 (0.5781) data time 0.0007 (0.0018) model time 0.5202 (0.5772) loss 6.9965 (7.2479) grad_norm 2.6063 (2.7899) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:40:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][400/625] eta 0:02:10 lr 0.000227 wd 0.0500 time 0.5687 (0.5779) data time 0.0006 (0.0018) model time 0.5682 (0.5769) loss 7.1011 (7.2549) grad_norm 4.4443 (2.8268) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:40:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][410/625] eta 0:02:04 lr 0.000227 wd 0.0500 time 0.5665 (0.5777) data time 0.0009 (0.0017) model time 0.5657 (0.5767) loss 7.1720 (7.2587) grad_norm 4.2960 (2.8324) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:41:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][420/625] eta 0:01:58 lr 0.000227 wd 0.0500 time 0.5665 (0.5776) data time 0.0008 (0.0017) model time 0.5657 (0.5766) loss 8.4921 (7.2667) grad_norm 2.0421 (2.8165) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:41:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][430/625] eta 0:01:52 lr 0.000227 wd 0.0500 time 0.5642 (0.5774) data time 0.0008 (0.0017) model time 0.5634 (0.5764) loss 7.2637 (7.2662) grad_norm 2.2383 (2.8024) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:41:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][440/625] eta 0:01:46 lr 0.000227 wd 0.0500 time 0.5682 (0.5773) data time 0.0006 (0.0017) model time 0.5677 (0.5763) loss 8.4349 (7.2645) grad_norm 1.5184 (2.7850) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:41:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][450/625] eta 0:01:41 lr 0.000227 wd 0.0500 time 0.5641 (0.5771) data time 0.0008 (0.0017) model time 0.5632 (0.5761) loss 7.2048 (7.2737) grad_norm 2.3460 (2.7724) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:41:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][460/625] eta 0:01:35 lr 0.000227 wd 0.0500 time 0.5662 (0.5770) data time 0.0006 (0.0016) model time 0.5656 (0.5759) loss 7.3940 (7.2828) grad_norm 1.9740 (2.7590) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:41:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][470/625] eta 0:01:29 lr 0.000227 wd 0.0500 time 0.5662 (0.5768) data time 0.0008 (0.0016) model time 0.5654 (0.5758) loss 8.3595 (7.2829) grad_norm 1.8927 (2.7450) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:41:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][480/625] eta 0:01:23 lr 0.000227 wd 0.0500 time 0.5679 (0.5767) data time 0.0006 (0.0016) model time 0.5673 (0.5756) loss 7.9487 (7.2884) grad_norm 1.9525 (2.7313) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:41:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][490/625] eta 0:01:17 lr 0.000226 wd 0.0500 time 0.5683 (0.5766) data time 0.0008 (0.0016) model time 0.5676 (0.5755) loss 6.5182 (7.2875) grad_norm 2.5160 (2.7226) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:41:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][500/625] eta 0:01:12 lr 0.000226 wd 0.0500 time 0.5678 (0.5765) data time 0.0008 (0.0016) model time 0.5670 (0.5754) loss 7.1208 (7.2869) grad_norm 1.6058 (2.7132) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:41:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][510/625] eta 0:01:06 lr 0.000226 wd 0.0500 time 0.5672 (0.5764) data time 0.0008 (0.0016) model time 0.5663 (0.5753) loss 8.1705 (7.2798) grad_norm 1.9871 (2.7107) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:41:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][520/625] eta 0:01:00 lr 0.000226 wd 0.0500 time 0.5686 (0.5763) data time 0.0008 (0.0015) model time 0.5678 (0.5752) loss 6.2777 (7.2734) grad_norm 1.8902 (2.7008) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:42:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][530/625] eta 0:00:54 lr 0.000226 wd 0.0500 time 0.5660 (0.5762) data time 0.0007 (0.0015) model time 0.5653 (0.5751) loss 7.3006 (7.2774) grad_norm 2.1474 (2.7049) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:42:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][540/625] eta 0:00:48 lr 0.000226 wd 0.0500 time 0.5678 (0.5760) data time 0.0008 (0.0015) model time 0.5670 (0.5749) loss 6.0448 (7.2785) grad_norm 2.6939 (2.7026) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:42:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][550/625] eta 0:00:43 lr 0.000226 wd 0.0500 time 0.5712 (0.5759) data time 0.0008 (0.0015) model time 0.5704 (0.5748) loss 7.9946 (7.2789) grad_norm 2.1227 (2.6916) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:42:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][560/625] eta 0:00:37 lr 0.000226 wd 0.0500 time 0.5659 (0.5759) data time 0.0008 (0.0015) model time 0.5651 (0.5747) loss 6.2497 (7.2771) grad_norm 2.1546 (2.6929) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:42:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][570/625] eta 0:00:31 lr 0.000226 wd 0.0500 time 0.5708 (0.5758) data time 0.0006 (0.0015) model time 0.5702 (0.5746) loss 6.3297 (7.2732) grad_norm 2.0656 (2.6914) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:42:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][580/625] eta 0:00:25 lr 0.000226 wd 0.0500 time 0.5654 (0.5757) data time 0.0008 (0.0015) model time 0.5646 (0.5746) loss 9.4613 (7.2780) grad_norm 2.2537 (2.6899) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:42:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][590/625] eta 0:00:20 lr 0.000226 wd 0.0500 time 0.5643 (0.5759) data time 0.0008 (0.0015) model time 0.5635 (0.5748) loss 8.4056 (7.2731) grad_norm 1.9051 (2.6819) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:42:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][600/625] eta 0:00:14 lr 0.000226 wd 0.0500 time 0.7721 (0.5766) data time 0.0008 (0.0014) model time 0.7714 (0.5756) loss 5.6403 (7.2706) grad_norm 1.9540 (2.6754) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:42:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][610/625] eta 0:00:08 lr 0.000225 wd 0.0500 time 0.5666 (0.5767) data time 0.0004 (0.0014) model time 0.5662 (0.5756) loss 7.1923 (7.2725) grad_norm 1.8157 (2.6669) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:42:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [221/300][620/625] eta 0:00:02 lr 0.000225 wd 0.0500 time 0.5650 (0.5767) data time 0.0006 (0.0014) model time 0.5645 (0.5757) loss 6.3602 (7.2735) grad_norm 2.4623 (2.6633) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:42:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 221 training takes 0:06:00 +[2024-07-25 13:42:58 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 13:43:00 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 13:43:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.474 (0.474) Loss 0.4888 (0.4888) Acc@1 90.527 (90.527) Acc@5 99.219 (99.219) Mem 22339MB +[2024-07-25 13:43:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.7739 (0.6081) Acc@1 81.494 (87.598) Acc@5 96.680 (98.038) Mem 22339MB +[2024-07-25 13:43:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8428 (0.6982) Acc@1 80.273 (84.947) Acc@5 96.045 (97.173) Mem 22339MB +[2024-07-25 13:43:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.595 Acc@5 97.165 +[2024-07-25 13:43:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.6% +[2024-07-25 13:43:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 84.59% +[2024-07-25 13:43:03 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 13:43:06 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 13:43:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.466 (0.466) Loss 0.5068 (0.5068) Acc@1 90.479 (90.479) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 13:43:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.156) Loss 0.7471 (0.6186) Acc@1 83.398 (87.695) Acc@5 96.875 (98.051) Mem 22339MB +[2024-07-25 13:43:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8589 (0.7073) Acc@1 79.248 (84.845) Acc@5 96.143 (97.224) Mem 22339MB +[2024-07-25 13:43:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.471 Acc@5 97.211 +[2024-07-25 13:43:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.5% +[2024-07-25 13:43:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.47% +[2024-07-25 13:43:10 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 13:43:11 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 13:43:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][0/625] eta 0:09:17 lr 0.000225 wd 0.0500 time 0.8925 (0.8925) data time 0.3735 (0.3735) model time 0.0000 (0.0000) loss 6.0439 (6.0439) grad_norm 1.7965 (1.7965) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:43:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][10/625] eta 0:06:11 lr 0.000225 wd 0.0500 time 0.5604 (0.6043) data time 0.0008 (0.0347) model time 0.0000 (0.0000) loss 6.5651 (6.6998) grad_norm 2.2628 (2.3647) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:43:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][20/625] eta 0:05:56 lr 0.000225 wd 0.0500 time 0.5704 (0.5896) data time 0.0008 (0.0186) model time 0.0000 (0.0000) loss 7.7682 (7.1671) grad_norm 2.2727 (2.2825) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:43:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][30/625] eta 0:05:46 lr 0.000225 wd 0.0500 time 0.5635 (0.5832) data time 0.0009 (0.0128) model time 0.0000 (0.0000) loss 7.9074 (7.1191) grad_norm 2.9859 (2.4343) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:43:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][40/625] eta 0:05:39 lr 0.000225 wd 0.0500 time 0.5657 (0.5802) data time 0.0008 (0.0099) model time 0.0000 (0.0000) loss 6.2241 (7.1947) grad_norm 4.1005 (2.6449) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:43:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][50/625] eta 0:05:32 lr 0.000225 wd 0.0500 time 0.5660 (0.5783) data time 0.0006 (0.0081) model time 0.0000 (0.0000) loss 8.7346 (7.1665) grad_norm 5.5444 (2.7693) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:43:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][60/625] eta 0:05:25 lr 0.000225 wd 0.0500 time 0.5648 (0.5768) data time 0.0006 (0.0069) model time 0.5642 (0.5680) loss 6.4565 (7.1834) grad_norm 2.9785 (2.8806) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:43:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][70/625] eta 0:05:19 lr 0.000225 wd 0.0500 time 0.5671 (0.5759) data time 0.0008 (0.0061) model time 0.5663 (0.5690) loss 7.6044 (7.2780) grad_norm 1.6699 (2.8180) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:43:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][80/625] eta 0:05:13 lr 0.000225 wd 0.0500 time 0.5695 (0.5753) data time 0.0006 (0.0054) model time 0.5689 (0.5693) loss 7.3325 (7.2962) grad_norm 1.8732 (2.8152) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:44:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][90/625] eta 0:05:07 lr 0.000225 wd 0.0500 time 0.5704 (0.5749) data time 0.0006 (0.0049) model time 0.5698 (0.5697) loss 8.2055 (7.2753) grad_norm 2.3250 (2.8231) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:44:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][100/625] eta 0:05:01 lr 0.000225 wd 0.0500 time 0.5684 (0.5746) data time 0.0006 (0.0045) model time 0.5677 (0.5699) loss 7.2616 (7.2764) grad_norm 2.1930 (2.7697) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:44:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][110/625] eta 0:04:55 lr 0.000224 wd 0.0500 time 0.5686 (0.5742) data time 0.0008 (0.0042) model time 0.5678 (0.5698) loss 7.4504 (7.2468) grad_norm 2.2258 (2.7180) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:44:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][120/625] eta 0:04:50 lr 0.000224 wd 0.0500 time 0.5691 (0.5744) data time 0.0006 (0.0039) model time 0.5684 (0.5707) loss 7.3109 (7.2558) grad_norm 2.4429 (2.6964) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 13:44:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][130/625] eta 0:04:44 lr 0.000224 wd 0.0500 time 0.5678 (0.5742) data time 0.0008 (0.0037) model time 0.5670 (0.5708) loss 8.4664 (7.2856) grad_norm 1.6169 (2.6611) loss_scale 1024.0000 (551.0840) mem 22339MB +[2024-07-25 13:44:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][140/625] eta 0:04:38 lr 0.000224 wd 0.0500 time 0.5677 (0.5740) data time 0.0009 (0.0035) model time 0.5668 (0.5708) loss 7.4326 (7.2573) grad_norm 2.9713 (2.6253) loss_scale 1024.0000 (584.6241) mem 22339MB +[2024-07-25 13:44:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][150/625] eta 0:04:32 lr 0.000224 wd 0.0500 time 0.5664 (0.5739) data time 0.0008 (0.0033) model time 0.5656 (0.5708) loss 8.1036 (7.2610) grad_norm 2.5039 (2.5976) loss_scale 1024.0000 (613.7219) mem 22339MB +[2024-07-25 13:44:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][160/625] eta 0:04:26 lr 0.000224 wd 0.0500 time 0.5737 (0.5741) data time 0.0008 (0.0031) model time 0.5728 (0.5713) loss 7.5844 (7.2301) grad_norm 2.6639 (2.6227) loss_scale 1024.0000 (639.2050) mem 22339MB +[2024-07-25 13:44:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][170/625] eta 0:04:21 lr 0.000224 wd 0.0500 time 0.5675 (0.5740) data time 0.0006 (0.0030) model time 0.5669 (0.5713) loss 6.9592 (7.2418) grad_norm 2.2950 (2.6225) loss_scale 1024.0000 (661.7076) mem 22339MB +[2024-07-25 13:44:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][180/625] eta 0:04:15 lr 0.000224 wd 0.0500 time 0.5696 (0.5739) data time 0.0006 (0.0029) model time 0.5690 (0.5713) loss 8.0175 (7.2381) grad_norm 2.6141 (2.6136) loss_scale 1024.0000 (681.7238) mem 22339MB +[2024-07-25 13:45:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][190/625] eta 0:04:10 lr 0.000224 wd 0.0500 time 0.5693 (0.5755) data time 0.0009 (0.0028) model time 0.5685 (0.5736) loss 7.0438 (7.2394) grad_norm 2.0045 (2.5849) loss_scale 1024.0000 (699.6440) mem 22339MB +[2024-07-25 13:45:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][200/625] eta 0:04:06 lr 0.000224 wd 0.0500 time 0.5672 (0.5791) data time 0.0007 (0.0027) model time 0.5665 (0.5785) loss 7.2615 (7.2140) grad_norm 3.6990 (2.5923) loss_scale 1024.0000 (715.7811) mem 22339MB +[2024-07-25 13:45:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][210/625] eta 0:04:00 lr 0.000224 wd 0.0500 time 0.5696 (0.5786) data time 0.0008 (0.0026) model time 0.5688 (0.5779) loss 8.1368 (7.2128) grad_norm 1.8563 (2.5981) loss_scale 1024.0000 (730.3886) mem 22339MB +[2024-07-25 13:45:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][220/625] eta 0:03:54 lr 0.000224 wd 0.0500 time 0.5719 (0.5783) data time 0.0008 (0.0025) model time 0.5710 (0.5774) loss 7.7639 (7.2099) grad_norm 2.6741 (2.5938) loss_scale 1024.0000 (743.6742) mem 22339MB +[2024-07-25 13:45:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][230/625] eta 0:03:48 lr 0.000223 wd 0.0500 time 0.5683 (0.5780) data time 0.0008 (0.0024) model time 0.5675 (0.5771) loss 7.6904 (7.2102) grad_norm 2.1079 (2.5834) loss_scale 1024.0000 (755.8095) mem 22339MB +[2024-07-25 13:45:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][240/625] eta 0:03:42 lr 0.000223 wd 0.0500 time 0.5704 (0.5779) data time 0.0006 (0.0024) model time 0.5698 (0.5769) loss 5.9002 (7.2121) grad_norm 1.7003 (2.5796) loss_scale 1024.0000 (766.9378) mem 22339MB +[2024-07-25 13:45:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][250/625] eta 0:03:36 lr 0.000223 wd 0.0500 time 0.5691 (0.5778) data time 0.0008 (0.0023) model time 0.5683 (0.5769) loss 7.1371 (7.2315) grad_norm 3.9613 (2.6037) loss_scale 1024.0000 (777.1793) mem 22339MB +[2024-07-25 13:45:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][260/625] eta 0:03:30 lr 0.000223 wd 0.0500 time 0.5673 (0.5775) data time 0.0008 (0.0022) model time 0.5665 (0.5765) loss 7.6977 (7.2496) grad_norm 2.1825 (2.5914) loss_scale 1024.0000 (786.6360) mem 22339MB +[2024-07-25 13:45:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][270/625] eta 0:03:24 lr 0.000223 wd 0.0500 time 0.5679 (0.5773) data time 0.0006 (0.0022) model time 0.5673 (0.5763) loss 6.3934 (7.2332) grad_norm 1.7579 (2.6429) loss_scale 1024.0000 (795.3948) mem 22339MB +[2024-07-25 13:45:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][280/625] eta 0:03:19 lr 0.000223 wd 0.0500 time 0.5675 (0.5772) data time 0.0009 (0.0022) model time 0.5666 (0.5761) loss 7.7202 (7.2242) grad_norm 2.1907 (2.6347) loss_scale 1024.0000 (803.5302) mem 22339MB +[2024-07-25 13:45:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][290/625] eta 0:03:13 lr 0.000223 wd 0.0500 time 0.5693 (0.5771) data time 0.0007 (0.0022) model time 0.5686 (0.5759) loss 6.9771 (7.2219) grad_norm 2.2798 (2.6215) loss_scale 1024.0000 (811.1065) mem 22339MB +[2024-07-25 13:46:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][300/625] eta 0:03:07 lr 0.000223 wd 0.0500 time 0.5623 (0.5771) data time 0.0007 (0.0021) model time 0.5616 (0.5760) loss 7.5690 (7.2154) grad_norm 2.5695 (2.6304) loss_scale 1024.0000 (818.1794) mem 22339MB +[2024-07-25 13:46:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][310/625] eta 0:03:01 lr 0.000223 wd 0.0500 time 0.5683 (0.5773) data time 0.0006 (0.0021) model time 0.5677 (0.5762) loss 7.3426 (7.2201) grad_norm 2.5528 (2.6216) loss_scale 1024.0000 (824.7974) mem 22339MB +[2024-07-25 13:46:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][320/625] eta 0:02:56 lr 0.000223 wd 0.0500 time 0.5690 (0.5771) data time 0.0008 (0.0020) model time 0.5683 (0.5759) loss 8.4565 (7.2319) grad_norm 3.4836 (2.6141) loss_scale 1024.0000 (831.0031) mem 22339MB +[2024-07-25 13:46:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][330/625] eta 0:02:50 lr 0.000223 wd 0.0500 time 0.5657 (0.5769) data time 0.0008 (0.0020) model time 0.5648 (0.5757) loss 8.1375 (7.2421) grad_norm 2.0977 (2.6185) loss_scale 1024.0000 (836.8338) mem 22339MB +[2024-07-25 13:46:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][340/625] eta 0:02:44 lr 0.000223 wd 0.0500 time 0.5619 (0.5770) data time 0.0007 (0.0020) model time 0.5612 (0.5759) loss 6.3963 (7.2413) grad_norm 1.9382 (2.6101) loss_scale 1024.0000 (842.3226) mem 22339MB +[2024-07-25 13:46:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][350/625] eta 0:02:38 lr 0.000222 wd 0.0500 time 0.5668 (0.5769) data time 0.0008 (0.0019) model time 0.5660 (0.5758) loss 7.7336 (7.2380) grad_norm 1.7361 (2.5917) loss_scale 1024.0000 (847.4986) mem 22339MB +[2024-07-25 13:46:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][360/625] eta 0:02:32 lr 0.000222 wd 0.0500 time 0.5700 (0.5768) data time 0.0008 (0.0019) model time 0.5693 (0.5756) loss 6.2898 (7.2332) grad_norm 6.4383 (2.6103) loss_scale 1024.0000 (852.3878) mem 22339MB +[2024-07-25 13:46:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][370/625] eta 0:02:27 lr 0.000222 wd 0.0500 time 0.5706 (0.5768) data time 0.0008 (0.0019) model time 0.5698 (0.5757) loss 7.6999 (7.2332) grad_norm 4.5015 (2.6105) loss_scale 1024.0000 (857.0135) mem 22339MB +[2024-07-25 13:46:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][380/625] eta 0:02:21 lr 0.000222 wd 0.0500 time 0.5692 (0.5768) data time 0.0007 (0.0018) model time 0.5685 (0.5757) loss 6.7592 (7.2264) grad_norm 1.9668 (2.6084) loss_scale 1024.0000 (861.3963) mem 22339MB +[2024-07-25 13:46:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][390/625] eta 0:02:15 lr 0.000222 wd 0.0500 time 0.5693 (0.5766) data time 0.0008 (0.0018) model time 0.5684 (0.5755) loss 6.5490 (7.2187) grad_norm 2.2238 (2.6057) loss_scale 1024.0000 (865.5550) mem 22339MB +[2024-07-25 13:47:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][400/625] eta 0:02:09 lr 0.000222 wd 0.0500 time 0.7449 (0.5769) data time 0.0006 (0.0018) model time 0.7442 (0.5758) loss 8.1774 (7.2214) grad_norm 1.8679 (2.5966) loss_scale 1024.0000 (869.5062) mem 22339MB +[2024-07-25 13:47:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][410/625] eta 0:02:04 lr 0.000222 wd 0.0500 time 0.5677 (0.5771) data time 0.0007 (0.0018) model time 0.5671 (0.5760) loss 8.0124 (7.2162) grad_norm 2.9116 (2.6014) loss_scale 1024.0000 (873.2652) mem 22339MB +[2024-07-25 13:47:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][420/625] eta 0:01:58 lr 0.000222 wd 0.0500 time 0.5659 (0.5786) data time 0.0008 (0.0017) model time 0.5651 (0.5777) loss 7.6250 (7.2170) grad_norm 2.3802 (2.6080) loss_scale 1024.0000 (876.8456) mem 22339MB +[2024-07-25 13:47:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][430/625] eta 0:01:52 lr 0.000222 wd 0.0500 time 0.5696 (0.5783) data time 0.0006 (0.0017) model time 0.5690 (0.5775) loss 7.9582 (7.2185) grad_norm 2.3088 (2.6098) loss_scale 1024.0000 (880.2599) mem 22339MB +[2024-07-25 13:47:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][440/625] eta 0:01:46 lr 0.000222 wd 0.0500 time 0.5669 (0.5781) data time 0.0007 (0.0017) model time 0.5662 (0.5773) loss 7.2445 (7.2053) grad_norm 1.7696 (2.6019) loss_scale 1024.0000 (883.5193) mem 22339MB +[2024-07-25 13:47:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][450/625] eta 0:01:41 lr 0.000222 wd 0.0500 time 0.5682 (0.5780) data time 0.0006 (0.0017) model time 0.5676 (0.5771) loss 8.0501 (7.2081) grad_norm 2.0563 (2.6008) loss_scale 1024.0000 (886.6341) mem 22339MB +[2024-07-25 13:47:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][460/625] eta 0:01:35 lr 0.000222 wd 0.0500 time 0.5678 (0.5778) data time 0.0006 (0.0017) model time 0.5672 (0.5769) loss 7.9198 (7.2061) grad_norm 3.1846 (2.6414) loss_scale 1024.0000 (889.6139) mem 22339MB +[2024-07-25 13:47:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][470/625] eta 0:01:29 lr 0.000221 wd 0.0500 time 0.5644 (0.5777) data time 0.0006 (0.0016) model time 0.5638 (0.5767) loss 6.8890 (7.1951) grad_norm 2.6130 (2.6426) loss_scale 1024.0000 (892.4671) mem 22339MB +[2024-07-25 13:47:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][480/625] eta 0:01:23 lr 0.000221 wd 0.0500 time 0.5686 (0.5775) data time 0.0008 (0.0016) model time 0.5678 (0.5766) loss 8.0396 (7.2040) grad_norm 2.0159 (2.6474) loss_scale 1024.0000 (895.2017) mem 22339MB +[2024-07-25 13:47:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][490/625] eta 0:01:17 lr 0.000221 wd 0.0500 time 0.5704 (0.5774) data time 0.0009 (0.0016) model time 0.5695 (0.5764) loss 7.3119 (7.1909) grad_norm 4.4764 (2.6449) loss_scale 1024.0000 (897.8248) mem 22339MB +[2024-07-25 13:48:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][500/625] eta 0:01:12 lr 0.000221 wd 0.0500 time 0.5705 (0.5772) data time 0.0008 (0.0016) model time 0.5697 (0.5762) loss 7.8240 (7.1958) grad_norm 2.3070 (2.6383) loss_scale 1024.0000 (900.3433) mem 22339MB +[2024-07-25 13:48:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][510/625] eta 0:01:06 lr 0.000221 wd 0.0500 time 0.5678 (0.5771) data time 0.0008 (0.0016) model time 0.5670 (0.5761) loss 7.9633 (7.1952) grad_norm 2.3825 (2.6336) loss_scale 1024.0000 (902.7632) mem 22339MB +[2024-07-25 13:48:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][520/625] eta 0:01:00 lr 0.000221 wd 0.0500 time 0.5663 (0.5769) data time 0.0006 (0.0016) model time 0.5657 (0.5759) loss 6.8204 (7.1903) grad_norm 4.1428 (2.6302) loss_scale 1024.0000 (905.0902) mem 22339MB +[2024-07-25 13:48:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][530/625] eta 0:00:54 lr 0.000221 wd 0.0500 time 0.5706 (0.5768) data time 0.0007 (0.0015) model time 0.5700 (0.5758) loss 6.1503 (7.1889) grad_norm 2.2626 (2.6270) loss_scale 1024.0000 (907.3296) mem 22339MB +[2024-07-25 13:48:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][540/625] eta 0:00:49 lr 0.000221 wd 0.0500 time 0.5683 (0.5767) data time 0.0008 (0.0015) model time 0.5675 (0.5757) loss 7.4231 (7.1822) grad_norm 2.2197 (2.6229) loss_scale 1024.0000 (909.4861) mem 22339MB +[2024-07-25 13:48:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][550/625] eta 0:00:43 lr 0.000221 wd 0.0500 time 0.7697 (0.5769) data time 0.0008 (0.0015) model time 0.7689 (0.5759) loss 7.7996 (7.1844) grad_norm 3.0373 (2.6419) loss_scale 1024.0000 (911.5644) mem 22339MB +[2024-07-25 13:48:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][560/625] eta 0:00:37 lr 0.000221 wd 0.0500 time 0.5690 (0.5767) data time 0.0006 (0.0015) model time 0.5684 (0.5757) loss 5.6229 (7.1835) grad_norm 2.1887 (2.6571) loss_scale 1024.0000 (913.5686) mem 22339MB +[2024-07-25 13:48:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][570/625] eta 0:00:31 lr 0.000221 wd 0.0500 time 0.5671 (0.5766) data time 0.0007 (0.0015) model time 0.5664 (0.5756) loss 6.0797 (7.1841) grad_norm 3.3611 (2.6819) loss_scale 1024.0000 (915.5026) mem 22339MB +[2024-07-25 13:48:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][580/625] eta 0:00:25 lr 0.000221 wd 0.0500 time 0.5672 (0.5765) data time 0.0006 (0.0015) model time 0.5666 (0.5755) loss 7.2342 (7.1866) grad_norm 2.2718 (2.6850) loss_scale 1024.0000 (917.3701) mem 22339MB +[2024-07-25 13:48:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][590/625] eta 0:00:20 lr 0.000221 wd 0.0500 time 0.5721 (0.5766) data time 0.0008 (0.0015) model time 0.5713 (0.5756) loss 6.9496 (7.1877) grad_norm 2.8039 (2.6761) loss_scale 1024.0000 (919.1743) mem 22339MB +[2024-07-25 13:48:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][600/625] eta 0:00:14 lr 0.000220 wd 0.0500 time 0.5670 (0.5765) data time 0.0006 (0.0015) model time 0.5664 (0.5755) loss 5.8376 (7.1881) grad_norm 2.7116 (2.6724) loss_scale 1024.0000 (920.9185) mem 22339MB +[2024-07-25 13:49:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][610/625] eta 0:00:08 lr 0.000220 wd 0.0500 time 0.5777 (0.5764) data time 0.0006 (0.0015) model time 0.5771 (0.5754) loss 6.8144 (7.1822) grad_norm 3.9580 (2.6721) loss_scale 1024.0000 (922.6056) mem 22339MB +[2024-07-25 13:49:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [222/300][620/625] eta 0:00:02 lr 0.000220 wd 0.0500 time 0.7274 (0.5766) data time 0.0006 (0.0014) model time 0.7269 (0.5755) loss 7.2226 (7.1838) grad_norm 2.8219 (2.6700) loss_scale 1024.0000 (924.2383) mem 22339MB +[2024-07-25 13:49:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 222 training takes 0:06:00 +[2024-07-25 13:49:11 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 13:49:13 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 13:49:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.477 (0.477) Loss 0.4944 (0.4944) Acc@1 90.771 (90.771) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-25 13:49:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.158) Loss 0.7578 (0.6112) Acc@1 83.008 (87.749) Acc@5 96.582 (98.038) Mem 22339MB +[2024-07-25 13:49:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.143) Loss 0.8535 (0.7001) Acc@1 79.736 (85.035) Acc@5 96.143 (97.189) Mem 22339MB +[2024-07-25 13:49:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.627 Acc@5 97.167 +[2024-07-25 13:49:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.6% +[2024-07-25 13:49:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 84.63% +[2024-07-25 13:49:16 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-25 13:49:18 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-25 13:49:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.492 (0.492) Loss 0.5068 (0.5068) Acc@1 90.479 (90.479) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-25 13:49:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.159) Loss 0.7480 (0.6185) Acc@1 83.252 (87.686) Acc@5 96.875 (98.051) Mem 22339MB +[2024-07-25 13:49:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.143) Loss 0.8574 (0.7070) Acc@1 79.346 (84.863) Acc@5 96.143 (97.226) Mem 22339MB +[2024-07-25 13:49:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.491 Acc@5 97.221 +[2024-07-25 13:49:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.5% +[2024-07-25 13:49:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.49% +[2024-07-25 13:49:21 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 13:49:23 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 13:49:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][0/625] eta 0:09:26 lr 0.000220 wd 0.0500 time 0.9067 (0.9067) data time 0.3847 (0.3847) model time 0.0000 (0.0000) loss 7.6589 (7.6589) grad_norm 2.3147 (2.3147) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:49:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][10/625] eta 0:06:29 lr 0.000220 wd 0.0500 time 0.7390 (0.6338) data time 0.0006 (0.0357) model time 0.0000 (0.0000) loss 6.2413 (7.1464) grad_norm 2.2765 (2.0837) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:49:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][20/625] eta 0:06:17 lr 0.000220 wd 0.0500 time 0.5691 (0.6246) data time 0.0006 (0.0191) model time 0.0000 (0.0000) loss 7.0238 (7.0909) grad_norm 2.7772 (2.5111) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:49:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][30/625] eta 0:06:01 lr 0.000220 wd 0.0500 time 0.5670 (0.6072) data time 0.0008 (0.0132) model time 0.0000 (0.0000) loss 8.5993 (7.1441) grad_norm 2.3057 (2.5939) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:49:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][40/625] eta 0:05:50 lr 0.000220 wd 0.0500 time 0.5693 (0.5985) data time 0.0006 (0.0102) model time 0.0000 (0.0000) loss 7.3376 (7.1198) grad_norm 3.0505 (2.5661) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:49:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][50/625] eta 0:05:42 lr 0.000220 wd 0.0500 time 0.6938 (0.5952) data time 0.0007 (0.0084) model time 0.0000 (0.0000) loss 7.5195 (7.1492) grad_norm 1.8917 (2.4781) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:49:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][60/625] eta 0:05:33 lr 0.000220 wd 0.0500 time 0.5685 (0.5902) data time 0.0007 (0.0071) model time 0.5678 (0.5640) loss 7.2103 (7.2097) grad_norm 1.7693 (2.4058) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:50:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][70/625] eta 0:05:25 lr 0.000220 wd 0.0500 time 0.5658 (0.5874) data time 0.0008 (0.0062) model time 0.5649 (0.5666) loss 8.1196 (7.2451) grad_norm 2.1604 (2.3796) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:50:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][80/625] eta 0:05:18 lr 0.000220 wd 0.0500 time 0.5708 (0.5852) data time 0.0008 (0.0056) model time 0.5700 (0.5674) loss 6.7780 (7.2626) grad_norm 1.6868 (2.3554) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:50:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][90/625] eta 0:05:12 lr 0.000219 wd 0.0500 time 0.5687 (0.5835) data time 0.0006 (0.0050) model time 0.5681 (0.5678) loss 6.6929 (7.2008) grad_norm 2.4285 (2.5118) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:50:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][100/625] eta 0:05:05 lr 0.000219 wd 0.0500 time 0.5668 (0.5823) data time 0.0007 (0.0046) model time 0.5661 (0.5684) loss 8.5358 (7.2165) grad_norm 1.9951 (2.4944) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:50:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][110/625] eta 0:04:59 lr 0.000219 wd 0.0500 time 0.5690 (0.5813) data time 0.0008 (0.0043) model time 0.5682 (0.5686) loss 6.2988 (7.2024) grad_norm 1.5529 (2.4547) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:50:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][120/625] eta 0:04:53 lr 0.000219 wd 0.0500 time 0.5677 (0.5803) data time 0.0006 (0.0040) model time 0.5671 (0.5687) loss 6.9537 (7.1929) grad_norm 1.8448 (2.4532) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:50:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][130/625] eta 0:04:46 lr 0.000219 wd 0.0500 time 0.5662 (0.5795) data time 0.0007 (0.0037) model time 0.5655 (0.5687) loss 6.3609 (7.1794) grad_norm 2.7847 (2.4620) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:50:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][140/625] eta 0:04:40 lr 0.000219 wd 0.0500 time 0.5680 (0.5789) data time 0.0009 (0.0035) model time 0.5671 (0.5688) loss 8.1431 (7.1900) grad_norm 2.0917 (2.4410) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:50:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][150/625] eta 0:04:34 lr 0.000219 wd 0.0500 time 0.5675 (0.5783) data time 0.0008 (0.0034) model time 0.5667 (0.5689) loss 7.0377 (7.1933) grad_norm 2.0525 (2.4119) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:50:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][160/625] eta 0:04:28 lr 0.000219 wd 0.0500 time 0.5676 (0.5777) data time 0.0006 (0.0032) model time 0.5670 (0.5688) loss 8.0975 (7.1883) grad_norm 15.0513 (2.4680) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:51:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][170/625] eta 0:04:22 lr 0.000219 wd 0.0500 time 0.5694 (0.5773) data time 0.0009 (0.0031) model time 0.5686 (0.5689) loss 7.5811 (7.2097) grad_norm 2.0925 (2.4513) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:51:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][180/625] eta 0:04:16 lr 0.000219 wd 0.0500 time 0.5700 (0.5769) data time 0.0006 (0.0029) model time 0.5693 (0.5689) loss 8.0111 (7.2129) grad_norm 2.2143 (2.4474) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:51:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][190/625] eta 0:04:10 lr 0.000219 wd 0.0500 time 0.5659 (0.5765) data time 0.0006 (0.0028) model time 0.5653 (0.5688) loss 7.2814 (7.1942) grad_norm 4.5160 (2.4471) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:51:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][200/625] eta 0:04:04 lr 0.000219 wd 0.0500 time 0.5677 (0.5761) data time 0.0006 (0.0027) model time 0.5671 (0.5688) loss 7.5374 (7.2046) grad_norm 2.2628 (2.4475) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:51:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][210/625] eta 0:03:58 lr 0.000219 wd 0.0500 time 0.5691 (0.5758) data time 0.0006 (0.0026) model time 0.5684 (0.5688) loss 5.6490 (7.2080) grad_norm 7.8862 (2.5265) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:51:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][220/625] eta 0:03:53 lr 0.000218 wd 0.0500 time 0.5692 (0.5763) data time 0.0008 (0.0025) model time 0.5684 (0.5698) loss 7.1690 (7.2095) grad_norm 7.1574 (2.5597) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:51:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][230/625] eta 0:03:48 lr 0.000218 wd 0.0500 time 0.7229 (0.5773) data time 0.0006 (0.0025) model time 0.7223 (0.5714) loss 6.4818 (7.2035) grad_norm 3.4422 (2.5764) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:51:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][240/625] eta 0:03:43 lr 0.000218 wd 0.0500 time 0.5711 (0.5798) data time 0.0008 (0.0024) model time 0.5703 (0.5748) loss 8.2382 (7.1918) grad_norm 2.2523 (2.5752) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:51:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][250/625] eta 0:03:37 lr 0.000218 wd 0.0500 time 0.5732 (0.5794) data time 0.0006 (0.0023) model time 0.5726 (0.5746) loss 7.3360 (7.1984) grad_norm 3.3640 (2.5723) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:51:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][260/625] eta 0:03:31 lr 0.000218 wd 0.0500 time 0.5617 (0.5792) data time 0.0006 (0.0023) model time 0.5611 (0.5745) loss 7.8230 (7.1916) grad_norm 1.8756 (2.5651) loss_scale 1024.0000 (1024.0000) mem 22339MB +[2024-07-25 13:51:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][270/625] eta 0:03:25 lr 0.000218 wd 0.0500 time 0.5693 (0.5789) data time 0.0007 (0.0022) model time 0.5685 (0.5743) loss 6.7464 (7.1848) grad_norm 1.7803 (inf) loss_scale 512.0000 (1020.2214) mem 22339MB +[2024-07-25 13:52:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][280/625] eta 0:03:19 lr 0.000218 wd 0.0500 time 0.5694 (0.5790) data time 0.0006 (0.0022) model time 0.5688 (0.5746) loss 7.4249 (7.1807) grad_norm 1.6947 (inf) loss_scale 512.0000 (1002.1352) mem 22339MB +[2024-07-25 13:52:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][290/625] eta 0:03:13 lr 0.000218 wd 0.0500 time 0.5733 (0.5787) data time 0.0006 (0.0021) model time 0.5727 (0.5744) loss 7.3436 (7.1800) grad_norm 2.5567 (inf) loss_scale 512.0000 (985.2921) mem 22339MB +[2024-07-25 13:52:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][300/625] eta 0:03:07 lr 0.000218 wd 0.0500 time 0.5720 (0.5784) data time 0.0006 (0.0021) model time 0.5713 (0.5742) loss 6.9917 (7.1838) grad_norm 3.0967 (inf) loss_scale 512.0000 (969.5681) mem 22339MB +[2024-07-25 13:52:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][310/625] eta 0:03:02 lr 0.000218 wd 0.0500 time 0.5712 (0.5782) data time 0.0008 (0.0020) model time 0.5704 (0.5741) loss 8.0590 (7.1760) grad_norm 2.5106 (inf) loss_scale 512.0000 (954.8553) mem 22339MB +[2024-07-25 13:52:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][320/625] eta 0:02:56 lr 0.000218 wd 0.0500 time 0.5697 (0.5780) data time 0.0008 (0.0020) model time 0.5689 (0.5740) loss 7.4664 (7.1790) grad_norm 1.5739 (inf) loss_scale 512.0000 (941.0592) mem 22339MB +[2024-07-25 13:52:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][330/625] eta 0:02:50 lr 0.000218 wd 0.0500 time 0.5732 (0.5778) data time 0.0007 (0.0020) model time 0.5725 (0.5739) loss 7.3223 (7.1733) grad_norm 1.8168 (inf) loss_scale 512.0000 (928.0967) mem 22339MB +[2024-07-25 13:52:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][340/625] eta 0:02:44 lr 0.000217 wd 0.0500 time 0.5685 (0.5776) data time 0.0006 (0.0019) model time 0.5679 (0.5737) loss 6.8076 (7.1692) grad_norm 1.7146 (inf) loss_scale 512.0000 (915.8944) mem 22339MB +[2024-07-25 13:52:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][350/625] eta 0:02:38 lr 0.000217 wd 0.0500 time 0.5653 (0.5774) data time 0.0006 (0.0019) model time 0.5647 (0.5736) loss 8.5588 (7.1671) grad_norm 3.1512 (inf) loss_scale 512.0000 (904.3875) mem 22339MB +[2024-07-25 13:52:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][360/625] eta 0:02:32 lr 0.000217 wd 0.0500 time 0.5650 (0.5772) data time 0.0008 (0.0019) model time 0.5642 (0.5734) loss 6.4087 (7.1666) grad_norm 3.0445 (inf) loss_scale 512.0000 (893.5180) mem 22339MB +[2024-07-25 13:52:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][370/625] eta 0:02:27 lr 0.000217 wd 0.0500 time 0.5685 (0.5770) data time 0.0007 (0.0018) model time 0.5678 (0.5733) loss 8.3234 (7.1788) grad_norm 2.3468 (inf) loss_scale 512.0000 (883.2345) mem 22339MB +[2024-07-25 13:53:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][380/625] eta 0:02:21 lr 0.000217 wd 0.0500 time 0.5641 (0.5768) data time 0.0008 (0.0018) model time 0.5633 (0.5731) loss 7.3360 (7.1816) grad_norm 1.9309 (inf) loss_scale 512.0000 (873.4908) mem 22339MB +[2024-07-25 13:53:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][390/625] eta 0:02:15 lr 0.000217 wd 0.0500 time 0.5686 (0.5766) data time 0.0006 (0.0018) model time 0.5679 (0.5731) loss 5.2318 (7.1779) grad_norm 2.7207 (inf) loss_scale 512.0000 (864.2455) mem 22339MB +[2024-07-25 13:53:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][400/625] eta 0:02:09 lr 0.000217 wd 0.0500 time 0.5630 (0.5766) data time 0.0008 (0.0018) model time 0.5622 (0.5730) loss 7.4376 (7.1766) grad_norm 4.1721 (inf) loss_scale 512.0000 (855.4613) mem 22339MB +[2024-07-25 13:53:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][410/625] eta 0:02:03 lr 0.000217 wd 0.0500 time 0.5697 (0.5764) data time 0.0006 (0.0017) model time 0.5691 (0.5730) loss 6.6165 (7.1753) grad_norm 1.5656 (inf) loss_scale 512.0000 (847.1046) mem 22339MB +[2024-07-25 13:53:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][420/625] eta 0:01:58 lr 0.000217 wd 0.0500 time 0.5737 (0.5763) data time 0.0008 (0.0017) model time 0.5728 (0.5729) loss 6.5505 (7.1786) grad_norm 4.6984 (inf) loss_scale 512.0000 (839.1449) mem 22339MB +[2024-07-25 13:53:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][430/625] eta 0:01:52 lr 0.000217 wd 0.0500 time 0.5647 (0.5761) data time 0.0006 (0.0017) model time 0.5641 (0.5728) loss 6.8064 (7.1757) grad_norm 5.2795 (inf) loss_scale 512.0000 (831.5545) mem 22339MB +[2024-07-25 13:53:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][440/625] eta 0:01:46 lr 0.000217 wd 0.0500 time 0.5649 (0.5767) data time 0.0008 (0.0017) model time 0.5641 (0.5734) loss 7.5102 (7.1791) grad_norm 4.9575 (inf) loss_scale 512.0000 (824.3084) mem 22339MB +[2024-07-25 13:53:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][450/625] eta 0:01:41 lr 0.000217 wd 0.0500 time 0.7334 (0.5776) data time 0.0008 (0.0017) model time 0.7326 (0.5745) loss 7.8284 (7.1867) grad_norm 2.9593 (inf) loss_scale 512.0000 (817.3836) mem 22339MB +[2024-07-25 13:53:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][460/625] eta 0:01:35 lr 0.000217 wd 0.0500 time 0.5631 (0.5792) data time 0.0007 (0.0017) model time 0.5624 (0.5763) loss 7.0814 (7.1918) grad_norm 2.0621 (inf) loss_scale 512.0000 (810.7592) mem 22339MB +[2024-07-25 13:53:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][470/625] eta 0:01:29 lr 0.000216 wd 0.0500 time 0.5711 (0.5791) data time 0.0009 (0.0016) model time 0.5702 (0.5764) loss 7.3237 (7.1868) grad_norm 2.3126 (inf) loss_scale 512.0000 (804.4161) mem 22339MB +[2024-07-25 13:54:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][480/625] eta 0:01:23 lr 0.000216 wd 0.0500 time 0.5666 (0.5790) data time 0.0008 (0.0016) model time 0.5658 (0.5763) loss 7.0623 (7.1908) grad_norm 3.1303 (inf) loss_scale 512.0000 (798.3368) mem 22339MB +[2024-07-25 13:54:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][490/625] eta 0:01:18 lr 0.000216 wd 0.0500 time 0.5203 (0.5792) data time 0.0009 (0.0016) model time 0.5195 (0.5766) loss 7.9794 (7.2008) grad_norm 2.2840 (inf) loss_scale 512.0000 (792.5051) mem 22339MB +[2024-07-25 13:54:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][500/625] eta 0:01:12 lr 0.000216 wd 0.0500 time 0.5717 (0.5791) data time 0.0006 (0.0016) model time 0.5711 (0.5764) loss 8.0005 (7.2001) grad_norm 1.7359 (inf) loss_scale 512.0000 (786.9062) mem 22339MB +[2024-07-25 13:54:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][510/625] eta 0:01:06 lr 0.000216 wd 0.0500 time 0.5671 (0.5789) data time 0.0009 (0.0016) model time 0.5662 (0.5763) loss 7.4089 (7.1966) grad_norm 2.3400 (inf) loss_scale 512.0000 (781.5264) mem 22339MB +[2024-07-25 13:54:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][520/625] eta 0:01:00 lr 0.000216 wd 0.0500 time 0.5700 (0.5788) data time 0.0006 (0.0016) model time 0.5695 (0.5762) loss 7.9483 (7.1909) grad_norm 3.9810 (inf) loss_scale 512.0000 (776.3532) mem 22339MB +[2024-07-25 13:54:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][530/625] eta 0:00:54 lr 0.000216 wd 0.0500 time 0.5631 (0.5787) data time 0.0008 (0.0015) model time 0.5623 (0.5762) loss 7.6124 (7.1927) grad_norm 3.6075 (inf) loss_scale 512.0000 (771.3748) mem 22339MB +[2024-07-25 13:54:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][540/625] eta 0:00:49 lr 0.000216 wd 0.0500 time 0.5670 (0.5786) data time 0.0006 (0.0015) model time 0.5664 (0.5761) loss 7.3601 (7.1944) grad_norm 1.9080 (inf) loss_scale 512.0000 (766.5804) mem 22339MB +[2024-07-25 13:54:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][550/625] eta 0:00:43 lr 0.000216 wd 0.0500 time 0.5707 (0.5787) data time 0.0007 (0.0015) model time 0.5700 (0.5762) loss 7.1400 (7.1930) grad_norm 2.8099 (inf) loss_scale 512.0000 (761.9601) mem 22339MB +[2024-07-25 13:54:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-25 13:54:46 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 13:54:49 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 13:57:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-25 13:57:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-25 13:57:55 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-25 13:58:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-25 13:58:06 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-25 13:58:07 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-25 13:58:07 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-25 13:58:07 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 223) +[2024-07-25 13:58:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-25 13:58:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-25 13:58:22 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 13:58:26 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 14:01:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-25 14:01:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-25 14:02:04 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-25 14:02:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-25 14:02:24 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-25 14:02:25 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-25 14:02:25 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-25 14:02:25 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 223) +[2024-07-25 14:02:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-25 14:02:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][560/625] eta 0:05:34 lr 0.000216 wd 0.0500 time 1.7176 (5.1501) data time 0.0008 (0.3279) model time 1.7169 (4.8222) loss 7.8442 (8.1383) grad_norm 2.5603 (2.2392) loss_scale 512.0000 (512.0000) mem 22342MB +[2024-07-25 14:02:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][570/625] eta 0:01:13 lr 0.000216 wd 0.0500 time 0.5776 (1.3394) data time 0.0006 (0.0553) model time 0.5770 (1.2841) loss 6.3582 (7.3685) grad_norm 2.7405 (2.8809) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 14:02:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][580/625] eta 0:00:44 lr 0.000216 wd 0.0500 time 0.5765 (0.9967) data time 0.0008 (0.0306) model time 0.5756 (0.9661) loss 7.1874 (7.5267) grad_norm 3.2485 (3.8869) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 14:02:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][590/625] eta 0:00:30 lr 0.000215 wd 0.0500 time 0.5714 (0.8672) data time 0.0006 (0.0213) model time 0.5708 (0.8459) loss 6.6538 (7.5282) grad_norm 1.7969 (3.3855) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 14:03:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][600/625] eta 0:00:19 lr 0.000215 wd 0.0500 time 0.5749 (0.7985) data time 0.0010 (0.0164) model time 0.5739 (0.7821) loss 7.5590 (7.4279) grad_norm 1.8431 (3.1051) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 14:03:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][610/625] eta 0:00:11 lr 0.000215 wd 0.0500 time 0.5476 (0.7583) data time 0.0004 (0.0135) model time 0.5472 (0.7448) loss 7.4702 (7.3953) grad_norm 2.0722 (2.9454) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 14:03:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [223/300][620/625] eta 0:00:03 lr 0.000215 wd 0.0500 time 0.5806 (0.7324) data time 0.0004 (0.0114) model time 0.5802 (0.7210) loss 8.1057 (7.3624) grad_norm 3.0741 (2.9516) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-25 14:03:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 223 training takes 0:00:47 +[2024-07-25 14:03:17 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 14:03:21 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 14:03:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.459 (0.459) Loss 0.5098 (0.5098) Acc@1 90.430 (90.430) Acc@5 98.730 (98.730) Mem 22341MB +[2024-07-25 14:03:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.155) Loss 0.7739 (0.6245) Acc@1 82.080 (87.509) Acc@5 96.680 (98.056) Mem 22341MB +[2024-07-25 14:03:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.141) Loss 0.8667 (0.7150) Acc@1 79.785 (84.794) Acc@5 95.996 (97.221) Mem 22341MB +[2024-07-25 14:03:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.461 Acc@5 97.207 +[2024-07-25 14:03:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.5% +[2024-07-25 14:03:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.812 (0.812) Loss 0.5068 (0.5068) Acc@1 90.430 (90.430) Acc@5 98.877 (98.877) Mem 22341MB +[2024-07-25 14:03:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.124 (0.187) Loss 0.7485 (0.6182) Acc@1 83.398 (87.695) Acc@5 96.875 (98.047) Mem 22341MB +[2024-07-25 14:03:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.124 (0.157) Loss 0.8574 (0.7065) Acc@1 79.346 (84.877) Acc@5 96.143 (97.235) Mem 22341MB +[2024-07-25 14:03:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.499 Acc@5 97.227 +[2024-07-25 14:03:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.5% +[2024-07-25 14:03:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.50% +[2024-07-25 14:03:30 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-25 14:03:33 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-25 14:03:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][0/625] eta 0:11:10 lr 0.000215 wd 0.0500 time 1.0725 (1.0725) data time 0.4233 (0.4233) model time 0.0000 (0.0000) loss 7.4475 (7.4475) grad_norm 1.8979 (1.8979) loss_scale 512.0000 (512.0000) mem 22337MB +[2024-07-25 14:03:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][10/625] eta 0:06:23 lr 0.000215 wd 0.0500 time 0.5790 (0.6228) data time 0.0008 (0.0394) model time 0.0000 (0.0000) loss 6.7947 (7.1057) grad_norm 3.1717 (2.5765) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 14:03:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][20/625] eta 0:06:03 lr 0.000215 wd 0.0500 time 0.5770 (0.6012) data time 0.0008 (0.0210) model time 0.0000 (0.0000) loss 7.4808 (7.0024) grad_norm 1.9489 (2.5430) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-25 14:03:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-25 14:03:51 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-25 14:03:52 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-25 18:48:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-25 18:48:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 19:13:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 19:13:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 19:13:58 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 19:14:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 19:14:14 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 19:14:14 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 19:14:15 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 19:14:15 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 224) +[2024-07-27 19:14:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 19:14:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][30/625] eta 0:53:11 lr 0.000215 wd 0.0500 time 1.6542 (5.3641) data time 0.0008 (0.4681) model time 0.0000 (0.0000) loss 8.3449 (8.7164) grad_norm 2.3186 (2.0996) loss_scale 512.0000 (512.0000) mem 22342MB +[2024-07-27 19:14:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][40/625] eta 0:13:21 lr 0.000215 wd 0.0500 time 0.5698 (1.3693) data time 0.0006 (0.0788) model time 0.0000 (0.0000) loss 6.2141 (7.5578) grad_norm 1.9412 (2.3697) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:14:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][50/625] eta 0:09:40 lr 0.000215 wd 0.0500 time 0.5712 (1.0093) data time 0.0009 (0.0434) model time 0.0000 (0.0000) loss 7.1208 (7.5289) grad_norm 1.9590 (2.2930) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:14:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][60/625] eta 0:08:12 lr 0.000215 wd 0.0500 time 0.5706 (0.8725) data time 0.0006 (0.0301) model time 0.5699 (0.5706) loss 6.4140 (7.4248) grad_norm 1.8864 (2.1832) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:14:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][70/625] eta 0:07:24 lr 0.000215 wd 0.0500 time 0.5698 (0.8007) data time 0.0009 (0.0232) model time 0.5689 (0.5702) loss 8.0849 (7.3398) grad_norm 2.4008 (2.2174) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:14:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][80/625] eta 0:06:53 lr 0.000215 wd 0.0500 time 0.5157 (0.7593) data time 0.0006 (0.0190) model time 0.5151 (0.5749) loss 6.5058 (7.3123) grad_norm 1.5719 (2.2320) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:15:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][90/625] eta 0:06:32 lr 0.000214 wd 0.0500 time 0.5782 (0.7334) data time 0.0006 (0.0160) model time 0.5775 (0.5806) loss 8.4533 (7.3077) grad_norm 2.0465 (2.2606) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:15:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][100/625] eta 0:06:13 lr 0.000214 wd 0.0500 time 0.5795 (0.7117) data time 0.0009 (0.0139) model time 0.5787 (0.5797) loss 7.5584 (7.3183) grad_norm 2.1444 (2.2596) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:15:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][110/625] eta 0:05:57 lr 0.000214 wd 0.0500 time 0.5788 (0.6951) data time 0.0008 (0.0124) model time 0.5780 (0.5789) loss 7.0366 (7.3346) grad_norm 4.0588 (2.3235) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:15:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][120/625] eta 0:05:44 lr 0.000214 wd 0.0500 time 0.5749 (0.6822) data time 0.0006 (0.0111) model time 0.5743 (0.5784) loss 6.1232 (7.2719) grad_norm 2.3412 (2.3355) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:15:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][130/625] eta 0:05:32 lr 0.000214 wd 0.0500 time 0.5749 (0.6717) data time 0.0007 (0.0101) model time 0.5742 (0.5778) loss 8.7297 (7.3190) grad_norm 4.5632 (2.3807) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:15:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][140/625] eta 0:05:21 lr 0.000214 wd 0.0500 time 0.5878 (0.6632) data time 0.0009 (0.0093) model time 0.5869 (0.5776) loss 8.2163 (7.3024) grad_norm 1.9778 (2.3801) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:15:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][150/625] eta 0:05:11 lr 0.000214 wd 0.0500 time 0.5765 (0.6559) data time 0.0006 (0.0086) model time 0.5759 (0.5772) loss 7.0333 (7.2867) grad_norm 2.7345 (2.4369) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:15:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][160/625] eta 0:05:02 lr 0.000214 wd 0.0500 time 0.5780 (0.6498) data time 0.0009 (0.0080) model time 0.5771 (0.5769) loss 7.4809 (7.2992) grad_norm 1.8996 (2.4558) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:15:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][170/625] eta 0:04:53 lr 0.000214 wd 0.0500 time 0.5813 (0.6447) data time 0.0008 (0.0075) model time 0.5805 (0.5769) loss 7.9867 (7.2964) grad_norm 4.5977 (2.4738) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:15:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][180/625] eta 0:04:44 lr 0.000214 wd 0.0500 time 0.5778 (0.6403) data time 0.0009 (0.0071) model time 0.5769 (0.5770) loss 8.1664 (7.2777) grad_norm 3.2690 (2.4852) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:16:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][190/625] eta 0:04:36 lr 0.000214 wd 0.0500 time 0.5806 (0.6365) data time 0.0008 (0.0067) model time 0.5799 (0.5770) loss 7.3076 (7.2814) grad_norm 1.8146 (2.5298) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:16:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][200/625] eta 0:04:29 lr 0.000214 wd 0.0500 time 0.5793 (0.6331) data time 0.0009 (0.0064) model time 0.5784 (0.5770) loss 6.3301 (7.2806) grad_norm 2.5807 (2.5138) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:16:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][210/625] eta 0:04:21 lr 0.000214 wd 0.0500 time 0.5760 (0.6300) data time 0.0008 (0.0061) model time 0.5752 (0.5769) loss 8.0887 (7.2741) grad_norm 2.1191 (2.5264) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:16:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][220/625] eta 0:04:14 lr 0.000213 wd 0.0500 time 0.5853 (0.6272) data time 0.0008 (0.0058) model time 0.5845 (0.5769) loss 7.5459 (7.2703) grad_norm 2.1937 (2.5216) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:16:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][230/625] eta 0:04:06 lr 0.000213 wd 0.0500 time 0.5769 (0.6247) data time 0.0007 (0.0056) model time 0.5762 (0.5768) loss 7.0586 (7.2469) grad_norm 2.6645 (2.5267) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:16:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][240/625] eta 0:03:59 lr 0.000213 wd 0.0500 time 0.5785 (0.6225) data time 0.0009 (0.0053) model time 0.5776 (0.5768) loss 7.1965 (7.2385) grad_norm 1.8090 (2.5158) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:16:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][250/625] eta 0:03:52 lr 0.000213 wd 0.0500 time 0.5803 (0.6205) data time 0.0007 (0.0051) model time 0.5797 (0.5768) loss 7.1225 (7.2390) grad_norm 1.9361 (2.5481) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:16:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][260/625] eta 0:03:45 lr 0.000213 wd 0.0500 time 0.5772 (0.6187) data time 0.0007 (0.0049) model time 0.5765 (0.5768) loss 7.3855 (7.2436) grad_norm 2.4756 (2.5553) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:16:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][270/625] eta 0:03:39 lr 0.000213 wd 0.0500 time 0.5761 (0.6170) data time 0.0006 (0.0048) model time 0.5754 (0.5769) loss 8.1482 (7.2445) grad_norm 2.7031 (2.5603) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:16:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][280/625] eta 0:03:32 lr 0.000213 wd 0.0500 time 0.5782 (0.6155) data time 0.0006 (0.0046) model time 0.5777 (0.5769) loss 8.2053 (7.2377) grad_norm 1.8686 (2.5558) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:17:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][290/625] eta 0:03:25 lr 0.000213 wd 0.0500 time 0.5794 (0.6140) data time 0.0008 (0.0045) model time 0.5786 (0.5768) loss 7.9636 (7.2234) grad_norm 1.8532 (2.5507) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:17:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][300/625] eta 0:03:19 lr 0.000213 wd 0.0500 time 0.6776 (0.6130) data time 0.0008 (0.0043) model time 0.6767 (0.5772) loss 6.5098 (7.1986) grad_norm 2.4630 (2.5551) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:17:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][310/625] eta 0:03:12 lr 0.000213 wd 0.0500 time 0.5780 (0.6124) data time 0.0008 (0.0042) model time 0.5772 (0.5779) loss 6.4455 (7.2044) grad_norm 2.0157 (2.5582) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:17:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][320/625] eta 0:03:06 lr 0.000213 wd 0.0500 time 0.5790 (0.6113) data time 0.0009 (0.0041) model time 0.5782 (0.5779) loss 6.9367 (7.2038) grad_norm 2.9638 (2.5514) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:17:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][330/625] eta 0:03:00 lr 0.000213 wd 0.0500 time 0.5778 (0.6102) data time 0.0009 (0.0040) model time 0.5769 (0.5779) loss 7.7936 (7.1955) grad_norm 2.0559 (2.5545) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:17:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][340/625] eta 0:02:53 lr 0.000212 wd 0.0500 time 0.5790 (0.6092) data time 0.0009 (0.0039) model time 0.5781 (0.5779) loss 8.0920 (7.1978) grad_norm 2.8013 (2.5443) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:17:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][350/625] eta 0:02:47 lr 0.000212 wd 0.0500 time 0.5736 (0.6082) data time 0.0010 (0.0038) model time 0.5726 (0.5779) loss 7.4061 (7.2169) grad_norm 2.0083 (2.5397) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:17:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][360/625] eta 0:02:40 lr 0.000212 wd 0.0500 time 0.5768 (0.6072) data time 0.0008 (0.0037) model time 0.5760 (0.5778) loss 7.6250 (7.2255) grad_norm 2.3267 (2.5627) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:17:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][370/625] eta 0:02:34 lr 0.000212 wd 0.0500 time 0.5763 (0.6063) data time 0.0007 (0.0036) model time 0.5756 (0.5777) loss 7.1615 (7.2320) grad_norm 2.9165 (2.5938) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:17:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][380/625] eta 0:02:28 lr 0.000212 wd 0.0500 time 0.5793 (0.6055) data time 0.0006 (0.0036) model time 0.5787 (0.5776) loss 7.3844 (7.2351) grad_norm 2.3290 (2.5937) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:17:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][390/625] eta 0:02:22 lr 0.000212 wd 0.0500 time 0.5783 (0.6048) data time 0.0006 (0.0035) model time 0.5776 (0.5777) loss 8.5933 (7.2355) grad_norm 2.6929 (2.5960) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:18:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][400/625] eta 0:02:15 lr 0.000212 wd 0.0500 time 0.5817 (0.6041) data time 0.0008 (0.0034) model time 0.5809 (0.5777) loss 6.4815 (7.2294) grad_norm 2.9407 (2.5977) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:18:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][410/625] eta 0:02:09 lr 0.000212 wd 0.0500 time 0.5787 (0.6034) data time 0.0007 (0.0034) model time 0.5781 (0.5777) loss 7.1303 (7.2220) grad_norm 2.2397 (2.5995) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:18:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][420/625] eta 0:02:03 lr 0.000212 wd 0.0500 time 0.5778 (0.6027) data time 0.0007 (0.0033) model time 0.5771 (0.5776) loss 6.4928 (7.2130) grad_norm 1.7510 (2.5888) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:18:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][430/625] eta 0:01:57 lr 0.000212 wd 0.0500 time 0.5766 (0.6021) data time 0.0006 (0.0032) model time 0.5759 (0.5776) loss 7.6653 (7.2186) grad_norm 2.8845 (2.5807) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:18:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][440/625] eta 0:01:51 lr 0.000212 wd 0.0500 time 0.5770 (0.6015) data time 0.0008 (0.0032) model time 0.5762 (0.5776) loss 7.0636 (7.2174) grad_norm 1.9152 (2.5831) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:18:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][450/625] eta 0:01:45 lr 0.000212 wd 0.0500 time 0.5778 (0.6009) data time 0.0007 (0.0031) model time 0.5771 (0.5775) loss 6.7651 (7.2145) grad_norm 2.2068 (2.5773) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:18:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][460/625] eta 0:01:39 lr 0.000212 wd 0.0500 time 0.5810 (0.6003) data time 0.0006 (0.0031) model time 0.5803 (0.5775) loss 6.3154 (7.2166) grad_norm 2.7213 (2.5699) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:18:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][470/625] eta 0:01:32 lr 0.000211 wd 0.0500 time 0.5870 (0.5999) data time 0.0006 (0.0030) model time 0.5864 (0.5775) loss 6.9416 (7.2154) grad_norm 4.6122 (2.5696) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:18:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][480/625] eta 0:01:26 lr 0.000211 wd 0.0500 time 0.5777 (0.5994) data time 0.0008 (0.0030) model time 0.5768 (0.5775) loss 6.6259 (7.2079) grad_norm 1.9673 (2.5718) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:18:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][490/625] eta 0:01:20 lr 0.000211 wd 0.0500 time 0.5782 (0.5989) data time 0.0006 (0.0029) model time 0.5776 (0.5775) loss 6.5608 (7.2032) grad_norm 2.5692 (2.5669) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:19:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][500/625] eta 0:01:14 lr 0.000211 wd 0.0500 time 0.5762 (0.5985) data time 0.0009 (0.0029) model time 0.5753 (0.5775) loss 7.6255 (7.1955) grad_norm 1.7534 (2.5607) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:19:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][510/625] eta 0:01:08 lr 0.000211 wd 0.0500 time 0.5759 (0.5980) data time 0.0007 (0.0029) model time 0.5752 (0.5774) loss 7.2268 (7.1960) grad_norm 1.7116 (2.5514) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:19:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][520/625] eta 0:01:02 lr 0.000211 wd 0.0500 time 0.5739 (0.5979) data time 0.0009 (0.0028) model time 0.5730 (0.5777) loss 8.4481 (7.2033) grad_norm 2.4057 (2.5621) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:19:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][530/625] eta 0:00:56 lr 0.000211 wd 0.0500 time 0.5837 (0.5978) data time 0.0009 (0.0028) model time 0.5829 (0.5780) loss 6.4003 (7.2031) grad_norm 1.9758 (2.5799) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:19:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][540/625] eta 0:00:50 lr 0.000211 wd 0.0500 time 0.5772 (0.5975) data time 0.0007 (0.0027) model time 0.5765 (0.5781) loss 7.3440 (7.2089) grad_norm 4.6596 (2.5776) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 19:19:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-27 19:19:29 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 19:19:35 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 19:22:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 19:22:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 19:22:51 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 19:23:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 19:23:05 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 19:23:06 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 19:23:06 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 19:23:06 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 224) +[2024-07-27 19:23:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 19:23:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][550/625] eta 0:05:31 lr 0.000211 wd 0.0500 time 0.5872 (4.4207) data time 0.0007 (0.2434) model time 0.5864 (4.1773) loss 6.0903 (7.1296) grad_norm 1.5081 (1.8708) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 19:23:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][560/625] eta 0:01:35 lr 0.000211 wd 0.0500 time 0.5854 (1.4724) data time 0.0010 (0.0570) model time 0.5843 (1.4155) loss 7.8691 (7.4769) grad_norm 2.2227 (2.4451) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 19:23:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][570/625] eta 0:00:59 lr 0.000211 wd 0.0500 time 0.5841 (1.0873) data time 0.0008 (0.0326) model time 0.5833 (1.0546) loss 8.0840 (7.5366) grad_norm 2.3462 (2.7576) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 19:23:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][580/625] eta 0:00:42 lr 0.000211 wd 0.0500 time 0.5805 (0.9352) data time 0.0008 (0.0230) model time 0.5797 (0.9122) loss 7.6483 (7.5151) grad_norm 2.5416 (2.6598) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 19:23:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][590/625] eta 0:00:29 lr 0.000210 wd 0.0500 time 0.5847 (0.8537) data time 0.0010 (0.0179) model time 0.5837 (0.8357) loss 7.6406 (7.3692) grad_norm 3.0101 (2.6556) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 19:23:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][600/625] eta 0:00:20 lr 0.000210 wd 0.0500 time 0.5846 (0.8060) data time 0.0010 (0.0147) model time 0.5836 (0.7913) loss 7.2340 (7.3502) grad_norm 2.1278 (2.6266) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 19:23:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][610/625] eta 0:00:11 lr 0.000210 wd 0.0500 time 0.5876 (0.7755) data time 0.0008 (0.0126) model time 0.5869 (0.7629) loss 6.2512 (7.3318) grad_norm 2.0551 (2.5341) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 19:24:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [224/300][620/625] eta 0:00:03 lr 0.000210 wd 0.0500 time 0.5893 (0.7499) data time 0.0007 (0.0110) model time 0.5886 (0.7390) loss 7.1057 (7.3018) grad_norm 1.9801 (2.5166) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 19:24:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 224 training takes 0:00:57 +[2024-07-27 19:24:07 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 19:24:11 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 19:24:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.523 (0.523) Loss 0.5078 (0.5078) Acc@1 90.576 (90.576) Acc@5 98.828 (98.828) Mem 22344MB +[2024-07-27 19:24:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.162) Loss 0.7612 (0.6130) Acc@1 81.934 (87.660) Acc@5 96.680 (97.994) Mem 22344MB +[2024-07-27 19:24:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.144) Loss 0.8584 (0.7012) Acc@1 79.639 (84.975) Acc@5 96.045 (97.180) Mem 22344MB +[2024-07-27 19:24:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.575 Acc@5 97.173 +[2024-07-27 19:24:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.6% +[2024-07-27 19:24:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.920 (0.920) Loss 0.5063 (0.5063) Acc@1 90.430 (90.430) Acc@5 98.877 (98.877) Mem 22344MB +[2024-07-27 19:24:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.197) Loss 0.7476 (0.6181) Acc@1 83.301 (87.695) Acc@5 96.875 (98.056) Mem 22344MB +[2024-07-27 19:24:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.163) Loss 0.8574 (0.7063) Acc@1 79.443 (84.877) Acc@5 96.143 (97.235) Mem 22344MB +[2024-07-27 19:24:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.497 Acc@5 97.225 +[2024-07-27 19:24:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.5% +[2024-07-27 19:24:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.50% +[2024-07-27 19:24:21 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-27 19:24:22 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-27 19:24:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][0/625] eta 0:13:37 lr 0.000210 wd 0.0500 time 1.3085 (1.3085) data time 0.4651 (0.4651) model time 0.0000 (0.0000) loss 6.2275 (6.2275) grad_norm 3.0013 (3.0013) loss_scale 512.0000 (512.0000) mem 22337MB +[2024-07-27 19:24:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][10/625] eta 0:06:44 lr 0.000210 wd 0.0500 time 0.5962 (0.6576) data time 0.0010 (0.0433) model time 0.0000 (0.0000) loss 7.9986 (7.2254) grad_norm 2.5339 (3.0851) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 19:24:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][20/625] eta 0:06:19 lr 0.000210 wd 0.0500 time 0.5907 (0.6265) data time 0.0008 (0.0231) model time 0.0000 (0.0000) loss 7.4808 (7.2901) grad_norm 2.7108 (2.8529) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 19:24:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][30/625] eta 0:06:05 lr 0.000210 wd 0.0500 time 0.5920 (0.6150) data time 0.0008 (0.0160) model time 0.0000 (0.0000) loss 6.1538 (7.3048) grad_norm 2.0750 (2.6001) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 19:24:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][40/625] eta 0:05:56 lr 0.000210 wd 0.0500 time 0.5924 (0.6092) data time 0.0010 (0.0123) model time 0.0000 (0.0000) loss 7.6965 (7.2570) grad_norm 2.4492 (2.5677) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 19:24:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][50/625] eta 0:05:48 lr 0.000210 wd 0.0500 time 0.5908 (0.6053) data time 0.0007 (0.0101) model time 0.0000 (0.0000) loss 7.3552 (7.2812) grad_norm 2.3713 (2.5121) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 19:24:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][60/625] eta 0:05:40 lr 0.000210 wd 0.0500 time 0.5924 (0.6033) data time 0.0010 (0.0086) model time 0.5915 (0.5916) loss 6.6014 (7.2869) grad_norm 5.0090 (2.5448) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 19:25:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][70/625] eta 0:05:34 lr 0.000210 wd 0.0500 time 0.5970 (0.6020) data time 0.0007 (0.0075) model time 0.5962 (0.5926) loss 7.2919 (7.2513) grad_norm 2.2425 (2.5252) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 19:25:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][80/625] eta 0:05:27 lr 0.000210 wd 0.0500 time 0.5949 (0.6012) data time 0.0007 (0.0067) model time 0.5941 (0.5932) loss 6.3129 (7.2045) grad_norm 1.8941 (2.5271) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 19:25:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][90/625] eta 0:05:21 lr 0.000209 wd 0.0500 time 0.5978 (0.6004) data time 0.0011 (0.0061) model time 0.5968 (0.5932) loss 7.8203 (7.2331) grad_norm 4.1385 (2.6224) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 19:25:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][100/625] eta 0:05:14 lr 0.000209 wd 0.0500 time 0.5935 (0.5996) data time 0.0010 (0.0056) model time 0.5925 (0.5926) loss 5.9043 (7.2231) grad_norm 2.2143 (2.6076) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 19:25:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][110/625] eta 0:05:08 lr 0.000209 wd 0.0500 time 0.5915 (0.5987) data time 0.0009 (0.0052) model time 0.5906 (0.5920) loss 7.4237 (7.2224) grad_norm 1.9179 (2.6615) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 19:25:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][120/625] eta 0:05:02 lr 0.000209 wd 0.0500 time 0.5929 (0.5982) data time 0.0011 (0.0049) model time 0.5919 (0.5920) loss 6.3379 (7.1753) grad_norm 3.3576 (2.6618) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 19:25:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][130/625] eta 0:04:56 lr 0.000209 wd 0.0500 time 0.7079 (0.5985) data time 0.0011 (0.0046) model time 0.7068 (0.5931) loss 8.1789 (7.1549) grad_norm 1.8809 (2.6612) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 19:25:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][140/625] eta 0:04:49 lr 0.000209 wd 0.0500 time 0.5975 (0.5978) data time 0.0011 (0.0043) model time 0.5963 (0.5925) loss 6.8630 (7.1456) grad_norm 4.9390 (2.7101) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 19:25:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][150/625] eta 0:04:43 lr 0.000209 wd 0.0500 time 0.5905 (0.5975) data time 0.0011 (0.0041) model time 0.5894 (0.5925) loss 8.6576 (7.1523) grad_norm 2.2069 (2.7065) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 19:25:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][160/625] eta 0:04:37 lr 0.000209 wd 0.0500 time 0.5934 (0.5973) data time 0.0008 (0.0039) model time 0.5926 (0.5925) loss 7.8865 (7.1596) grad_norm 4.6110 (2.7175) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 19:26:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][170/625] eta 0:04:31 lr 0.000209 wd 0.0500 time 0.5928 (0.5970) data time 0.0010 (0.0038) model time 0.5919 (0.5924) loss 5.9783 (7.1730) grad_norm 2.6796 (2.7298) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 19:26:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][180/625] eta 0:04:25 lr 0.000209 wd 0.0500 time 0.5898 (0.5966) data time 0.0011 (0.0036) model time 0.5887 (0.5922) loss 7.5324 (7.1514) grad_norm 5.0356 (2.7971) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 19:26:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][190/625] eta 0:04:19 lr 0.000209 wd 0.0500 time 0.5906 (0.5963) data time 0.0008 (0.0035) model time 0.5897 (0.5920) loss 8.0664 (7.1501) grad_norm 2.3527 (inf) loss_scale 256.0000 (507.9791) mem 22339MB +[2024-07-27 19:26:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][200/625] eta 0:04:13 lr 0.000209 wd 0.0500 time 0.5919 (0.5969) data time 0.0010 (0.0033) model time 0.5909 (0.5930) loss 6.4639 (7.1522) grad_norm 2.7425 (inf) loss_scale 256.0000 (495.4428) mem 22339MB +[2024-07-27 19:26:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][210/625] eta 0:04:07 lr 0.000209 wd 0.0500 time 0.5997 (0.5968) data time 0.0008 (0.0032) model time 0.5990 (0.5931) loss 8.3722 (7.1502) grad_norm 2.6354 (inf) loss_scale 256.0000 (484.0948) mem 22339MB +[2024-07-27 19:26:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][220/625] eta 0:04:01 lr 0.000208 wd 0.0500 time 0.5934 (0.5968) data time 0.0011 (0.0031) model time 0.5923 (0.5931) loss 6.3366 (7.1264) grad_norm 2.6500 (inf) loss_scale 256.0000 (473.7738) mem 22339MB +[2024-07-27 19:26:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][230/625] eta 0:03:55 lr 0.000208 wd 0.0500 time 0.5919 (0.5966) data time 0.0008 (0.0030) model time 0.5912 (0.5931) loss 6.0692 (7.1181) grad_norm 2.2164 (inf) loss_scale 256.0000 (464.3463) mem 22339MB +[2024-07-27 19:26:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][240/625] eta 0:03:49 lr 0.000208 wd 0.0500 time 0.6347 (0.5966) data time 0.0011 (0.0030) model time 0.6336 (0.5932) loss 6.6101 (7.1455) grad_norm 2.3799 (inf) loss_scale 256.0000 (455.7012) mem 22339MB +[2024-07-27 19:26:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][250/625] eta 0:03:43 lr 0.000208 wd 0.0500 time 0.5888 (0.5964) data time 0.0008 (0.0029) model time 0.5880 (0.5930) loss 7.2375 (7.1541) grad_norm 4.8515 (inf) loss_scale 256.0000 (447.7450) mem 22339MB +[2024-07-27 19:26:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][260/625] eta 0:03:37 lr 0.000208 wd 0.0500 time 0.5925 (0.5961) data time 0.0012 (0.0028) model time 0.5913 (0.5929) loss 7.5414 (7.1498) grad_norm 2.7283 (inf) loss_scale 256.0000 (440.3985) mem 22339MB +[2024-07-27 19:27:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][270/625] eta 0:03:31 lr 0.000208 wd 0.0500 time 0.5918 (0.5959) data time 0.0009 (0.0027) model time 0.5909 (0.5926) loss 7.2891 (7.1578) grad_norm 1.8079 (inf) loss_scale 256.0000 (433.5941) mem 22339MB +[2024-07-27 19:27:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][280/625] eta 0:03:25 lr 0.000208 wd 0.0500 time 0.5971 (0.5958) data time 0.0011 (0.0027) model time 0.5960 (0.5926) loss 7.5888 (7.1555) grad_norm 2.2341 (inf) loss_scale 256.0000 (427.2740) mem 22339MB +[2024-07-27 19:27:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][290/625] eta 0:03:19 lr 0.000208 wd 0.0500 time 0.5944 (0.5957) data time 0.0008 (0.0026) model time 0.5937 (0.5926) loss 7.0746 (7.1529) grad_norm 1.9041 (inf) loss_scale 256.0000 (421.3883) mem 22339MB +[2024-07-27 19:27:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][300/625] eta 0:03:13 lr 0.000208 wd 0.0500 time 0.5920 (0.5956) data time 0.0010 (0.0026) model time 0.5910 (0.5926) loss 7.7748 (7.1574) grad_norm 2.1333 (inf) loss_scale 256.0000 (415.8937) mem 22339MB +[2024-07-27 19:27:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][310/625] eta 0:03:07 lr 0.000208 wd 0.0500 time 0.5933 (0.5957) data time 0.0008 (0.0025) model time 0.5925 (0.5928) loss 6.3339 (7.1526) grad_norm 1.7471 (inf) loss_scale 256.0000 (410.7524) mem 22339MB +[2024-07-27 19:27:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][320/625] eta 0:03:01 lr 0.000208 wd 0.0500 time 0.5916 (0.5955) data time 0.0007 (0.0025) model time 0.5908 (0.5926) loss 6.6051 (7.1519) grad_norm 2.9201 (inf) loss_scale 256.0000 (405.9315) mem 22339MB +[2024-07-27 19:27:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][330/625] eta 0:02:55 lr 0.000208 wd 0.0500 time 0.5924 (0.5954) data time 0.0010 (0.0024) model time 0.5914 (0.5925) loss 7.4299 (7.1615) grad_norm 3.0349 (inf) loss_scale 256.0000 (401.4018) mem 22339MB +[2024-07-27 19:27:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][340/625] eta 0:02:49 lr 0.000207 wd 0.0500 time 0.5886 (0.5953) data time 0.0008 (0.0024) model time 0.5879 (0.5924) loss 8.1895 (7.1647) grad_norm 2.3117 (inf) loss_scale 256.0000 (397.1378) mem 22339MB +[2024-07-27 19:27:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][350/625] eta 0:02:43 lr 0.000207 wd 0.0500 time 0.5949 (0.5951) data time 0.0008 (0.0023) model time 0.5941 (0.5923) loss 7.2801 (7.1731) grad_norm 2.6674 (inf) loss_scale 256.0000 (393.1168) mem 22339MB +[2024-07-27 19:27:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][360/625] eta 0:02:37 lr 0.000207 wd 0.0500 time 0.5933 (0.5955) data time 0.0010 (0.0023) model time 0.5923 (0.5928) loss 7.2343 (7.1846) grad_norm 3.4519 (inf) loss_scale 256.0000 (389.3186) mem 22339MB +[2024-07-27 19:28:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][370/625] eta 0:02:31 lr 0.000207 wd 0.0500 time 0.5921 (0.5954) data time 0.0012 (0.0023) model time 0.5909 (0.5928) loss 6.7419 (7.1765) grad_norm 1.8380 (inf) loss_scale 256.0000 (385.7251) mem 22339MB +[2024-07-27 19:28:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][380/625] eta 0:02:25 lr 0.000207 wd 0.0500 time 0.5895 (0.5953) data time 0.0008 (0.0022) model time 0.5887 (0.5927) loss 6.3257 (7.1676) grad_norm 4.2386 (inf) loss_scale 256.0000 (382.3202) mem 22339MB +[2024-07-27 19:28:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][390/625] eta 0:02:19 lr 0.000207 wd 0.0500 time 0.5931 (0.5952) data time 0.0010 (0.0022) model time 0.5921 (0.5927) loss 6.6124 (7.1623) grad_norm 3.1565 (inf) loss_scale 256.0000 (379.0895) mem 22339MB +[2024-07-27 19:28:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][400/625] eta 0:02:13 lr 0.000207 wd 0.0500 time 0.5918 (0.5951) data time 0.0010 (0.0022) model time 0.5908 (0.5926) loss 6.0840 (7.1651) grad_norm 2.3266 (inf) loss_scale 256.0000 (376.0200) mem 22339MB +[2024-07-27 19:28:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][410/625] eta 0:02:07 lr 0.000207 wd 0.0500 time 0.5920 (0.5951) data time 0.0010 (0.0022) model time 0.5909 (0.5927) loss 7.4929 (7.1735) grad_norm 2.3636 (inf) loss_scale 256.0000 (373.0998) mem 22339MB +[2024-07-27 19:28:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][420/625] eta 0:02:02 lr 0.000207 wd 0.0500 time 0.5883 (0.5954) data time 0.0008 (0.0021) model time 0.5875 (0.5930) loss 7.3664 (7.1723) grad_norm 2.6167 (inf) loss_scale 256.0000 (370.3183) mem 22339MB +[2024-07-27 19:28:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][430/625] eta 0:01:56 lr 0.000207 wd 0.0500 time 0.5886 (0.5954) data time 0.0008 (0.0021) model time 0.5878 (0.5930) loss 7.4367 (7.1777) grad_norm 2.3565 (inf) loss_scale 256.0000 (367.6659) mem 22339MB +[2024-07-27 19:28:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][440/625] eta 0:01:50 lr 0.000207 wd 0.0500 time 0.5986 (0.5954) data time 0.0010 (0.0021) model time 0.5976 (0.5930) loss 6.6165 (7.1825) grad_norm 2.3372 (inf) loss_scale 256.0000 (365.1338) mem 22339MB +[2024-07-27 19:28:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][450/625] eta 0:01:44 lr 0.000207 wd 0.0500 time 0.5903 (0.5953) data time 0.0008 (0.0021) model time 0.5895 (0.5930) loss 6.4138 (7.1780) grad_norm 2.8963 (inf) loss_scale 256.0000 (362.7140) mem 22339MB +[2024-07-27 19:28:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][460/625] eta 0:01:38 lr 0.000207 wd 0.0500 time 0.5915 (0.5953) data time 0.0008 (0.0020) model time 0.5907 (0.5930) loss 7.4805 (7.1772) grad_norm 1.9994 (inf) loss_scale 256.0000 (360.3991) mem 22339MB +[2024-07-27 19:29:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][470/625] eta 0:01:32 lr 0.000206 wd 0.0500 time 0.5861 (0.5952) data time 0.0008 (0.0020) model time 0.5853 (0.5929) loss 7.4719 (7.1759) grad_norm 2.4655 (inf) loss_scale 256.0000 (358.1826) mem 22339MB +[2024-07-27 19:29:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][480/625] eta 0:01:26 lr 0.000206 wd 0.0500 time 0.5890 (0.5951) data time 0.0010 (0.0020) model time 0.5880 (0.5929) loss 8.0183 (7.1806) grad_norm 3.6017 (inf) loss_scale 256.0000 (356.0582) mem 22339MB +[2024-07-27 19:29:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][490/625] eta 0:01:20 lr 0.000206 wd 0.0500 time 0.5901 (0.5951) data time 0.0008 (0.0020) model time 0.5893 (0.5928) loss 8.0680 (7.1868) grad_norm 2.0701 (inf) loss_scale 256.0000 (354.0204) mem 22339MB +[2024-07-27 19:29:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][500/625] eta 0:01:14 lr 0.000206 wd 0.0500 time 0.5975 (0.5951) data time 0.0010 (0.0019) model time 0.5966 (0.5929) loss 8.0850 (7.1894) grad_norm 2.6179 (inf) loss_scale 256.0000 (352.0639) mem 22339MB +[2024-07-27 19:29:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][510/625] eta 0:01:08 lr 0.000206 wd 0.0500 time 0.5933 (0.5951) data time 0.0010 (0.0019) model time 0.5922 (0.5929) loss 7.9022 (7.1893) grad_norm 2.2267 (inf) loss_scale 256.0000 (350.1840) mem 22339MB +[2024-07-27 19:29:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][520/625] eta 0:01:02 lr 0.000206 wd 0.0500 time 0.5965 (0.5951) data time 0.0010 (0.0019) model time 0.5955 (0.5929) loss 7.3974 (7.1906) grad_norm 2.2555 (inf) loss_scale 256.0000 (348.3762) mem 22339MB +[2024-07-27 19:29:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][530/625] eta 0:00:56 lr 0.000206 wd 0.0500 time 0.5942 (0.5950) data time 0.0010 (0.0019) model time 0.5932 (0.5929) loss 6.6746 (7.1849) grad_norm 1.8077 (inf) loss_scale 256.0000 (346.6365) mem 22339MB +[2024-07-27 19:29:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][540/625] eta 0:00:50 lr 0.000206 wd 0.0500 time 0.5913 (0.5949) data time 0.0010 (0.0019) model time 0.5903 (0.5928) loss 6.9868 (7.1879) grad_norm 1.9438 (inf) loss_scale 256.0000 (344.9612) mem 22339MB +[2024-07-27 19:29:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][550/625] eta 0:00:44 lr 0.000206 wd 0.0500 time 0.5916 (0.5949) data time 0.0007 (0.0019) model time 0.5909 (0.5928) loss 6.5837 (7.1868) grad_norm 2.5178 (inf) loss_scale 256.0000 (343.3466) mem 22339MB +[2024-07-27 19:29:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][560/625] eta 0:00:38 lr 0.000206 wd 0.0500 time 0.5938 (0.5949) data time 0.0009 (0.0018) model time 0.5929 (0.5928) loss 7.6593 (7.1894) grad_norm 1.9641 (inf) loss_scale 256.0000 (341.7897) mem 22339MB +[2024-07-27 19:30:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][570/625] eta 0:00:32 lr 0.000206 wd 0.0500 time 0.6081 (0.5948) data time 0.0010 (0.0018) model time 0.6071 (0.5928) loss 7.4249 (7.1842) grad_norm 2.0758 (inf) loss_scale 256.0000 (340.2872) mem 22339MB +[2024-07-27 19:30:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][580/625] eta 0:00:26 lr 0.000206 wd 0.0500 time 0.5986 (0.5950) data time 0.0008 (0.0018) model time 0.5978 (0.5930) loss 6.0218 (7.1775) grad_norm 1.7458 (inf) loss_scale 256.0000 (338.8365) mem 22339MB +[2024-07-27 19:30:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][590/625] eta 0:00:20 lr 0.000206 wd 0.0500 time 0.5974 (0.5950) data time 0.0009 (0.0018) model time 0.5965 (0.5930) loss 7.2317 (7.1811) grad_norm 2.0689 (inf) loss_scale 256.0000 (337.4349) mem 22339MB +[2024-07-27 19:30:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][600/625] eta 0:00:14 lr 0.000205 wd 0.0500 time 0.5934 (0.5950) data time 0.0010 (0.0018) model time 0.5924 (0.5930) loss 7.6859 (7.1801) grad_norm 2.9179 (inf) loss_scale 256.0000 (336.0799) mem 22339MB +[2024-07-27 19:30:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][610/625] eta 0:00:08 lr 0.000205 wd 0.0500 time 0.5926 (0.5949) data time 0.0005 (0.0018) model time 0.5921 (0.5929) loss 6.5741 (7.1805) grad_norm 3.0280 (inf) loss_scale 256.0000 (334.7692) mem 22339MB +[2024-07-27 19:30:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [225/300][620/625] eta 0:00:02 lr 0.000205 wd 0.0500 time 0.5943 (0.5949) data time 0.0005 (0.0018) model time 0.5938 (0.5929) loss 7.2719 (7.1783) grad_norm 2.0055 (inf) loss_scale 256.0000 (333.5008) mem 22339MB +[2024-07-27 19:30:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 225 training takes 0:06:11 +[2024-07-27 19:30:34 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 19:30:35 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 19:30:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.513 (0.513) Loss 0.5020 (0.5020) Acc@1 90.430 (90.430) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-27 19:30:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.162) Loss 0.7568 (0.6129) Acc@1 82.471 (87.784) Acc@5 96.582 (98.042) Mem 22339MB +[2024-07-27 19:30:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.145) Loss 0.8423 (0.7028) Acc@1 80.078 (85.080) Acc@5 95.801 (97.177) Mem 22339MB +[2024-07-27 19:30:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.699 Acc@5 97.171 +[2024-07-27 19:30:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.7% +[2024-07-27 19:30:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 84.70% +[2024-07-27 19:30:39 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-27 19:30:40 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-27 19:30:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.510 (0.510) Loss 0.5063 (0.5063) Acc@1 90.479 (90.479) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-27 19:30:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.161) Loss 0.7471 (0.6175) Acc@1 83.301 (87.722) Acc@5 96.875 (98.069) Mem 22339MB +[2024-07-27 19:30:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.145) Loss 0.8564 (0.7056) Acc@1 79.443 (84.884) Acc@5 96.094 (97.252) Mem 22339MB +[2024-07-27 19:30:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.511 Acc@5 97.239 +[2024-07-27 19:30:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.5% +[2024-07-27 19:30:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.51% +[2024-07-27 19:30:44 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-27 19:30:46 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-27 19:30:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][0/625] eta 0:10:25 lr 0.000205 wd 0.0500 time 1.0003 (1.0003) data time 0.4622 (0.4622) model time 0.0000 (0.0000) loss 8.0924 (8.0924) grad_norm 2.0662 (2.0662) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:30:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][10/625] eta 0:06:26 lr 0.000205 wd 0.0500 time 0.5953 (0.6283) data time 0.0007 (0.0429) model time 0.0000 (0.0000) loss 6.5046 (6.9371) grad_norm 2.5453 (2.2263) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:30:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][20/625] eta 0:06:15 lr 0.000205 wd 0.0500 time 0.5896 (0.6203) data time 0.0008 (0.0229) model time 0.0000 (0.0000) loss 6.7815 (6.9782) grad_norm 1.8011 (2.2140) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:31:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][30/625] eta 0:06:03 lr 0.000205 wd 0.0500 time 0.5913 (0.6115) data time 0.0010 (0.0159) model time 0.0000 (0.0000) loss 7.3608 (7.1448) grad_norm 1.8888 (2.1791) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:31:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][40/625] eta 0:05:55 lr 0.000205 wd 0.0500 time 0.5736 (0.6070) data time 0.0011 (0.0122) model time 0.0000 (0.0000) loss 8.0863 (7.2733) grad_norm 2.4673 (2.2120) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:31:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][50/625] eta 0:05:47 lr 0.000205 wd 0.0500 time 0.5930 (0.6039) data time 0.0010 (0.0100) model time 0.0000 (0.0000) loss 5.8547 (7.1927) grad_norm 2.5961 (2.3386) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:31:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][60/625] eta 0:05:40 lr 0.000205 wd 0.0500 time 0.5871 (0.6022) data time 0.0008 (0.0086) model time 0.5863 (0.5922) loss 7.3836 (7.2142) grad_norm 2.3028 (2.4380) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:31:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][70/625] eta 0:05:33 lr 0.000205 wd 0.0500 time 0.5909 (0.6006) data time 0.0008 (0.0075) model time 0.5901 (0.5912) loss 7.4226 (7.2210) grad_norm 3.0968 (2.4828) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:31:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][80/625] eta 0:05:27 lr 0.000205 wd 0.0500 time 0.5898 (0.6010) data time 0.0010 (0.0067) model time 0.5888 (0.5949) loss 7.8378 (7.2274) grad_norm 2.5007 (2.6157) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:31:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][90/625] eta 0:05:21 lr 0.000205 wd 0.0500 time 0.5935 (0.6004) data time 0.0010 (0.0061) model time 0.5925 (0.5949) loss 7.5270 (7.2412) grad_norm 2.0531 (2.6086) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:31:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][100/625] eta 0:05:15 lr 0.000204 wd 0.0500 time 0.5935 (0.6000) data time 0.0010 (0.0056) model time 0.5926 (0.5949) loss 6.7858 (7.2224) grad_norm 2.1471 (2.6406) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:31:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][110/625] eta 0:05:08 lr 0.000204 wd 0.0500 time 0.5870 (0.5996) data time 0.0007 (0.0052) model time 0.5863 (0.5948) loss 6.0303 (7.1776) grad_norm 3.8571 (2.6558) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:31:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][120/625] eta 0:05:02 lr 0.000204 wd 0.0500 time 0.5876 (0.5989) data time 0.0009 (0.0049) model time 0.5867 (0.5942) loss 6.6956 (7.1866) grad_norm 2.0388 (2.6094) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:32:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][130/625] eta 0:04:56 lr 0.000204 wd 0.0500 time 0.5820 (0.5983) data time 0.0007 (0.0046) model time 0.5813 (0.5937) loss 7.1172 (7.1679) grad_norm 2.1436 (2.5909) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:32:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][140/625] eta 0:04:49 lr 0.000204 wd 0.0500 time 0.5862 (0.5978) data time 0.0008 (0.0043) model time 0.5854 (0.5933) loss 8.0699 (7.1596) grad_norm 1.8402 (2.5666) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:32:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][150/625] eta 0:04:43 lr 0.000204 wd 0.0500 time 0.5894 (0.5973) data time 0.0008 (0.0041) model time 0.5886 (0.5929) loss 7.0254 (7.1420) grad_norm 2.2675 (2.5622) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:32:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][160/625] eta 0:04:37 lr 0.000204 wd 0.0500 time 0.5932 (0.5971) data time 0.0010 (0.0039) model time 0.5922 (0.5929) loss 6.3302 (7.1528) grad_norm 2.4989 (2.5893) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:32:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][170/625] eta 0:04:31 lr 0.000204 wd 0.0500 time 0.5917 (0.5970) data time 0.0007 (0.0037) model time 0.5910 (0.5931) loss 7.5338 (7.1565) grad_norm 2.0848 (2.5817) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:32:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][180/625] eta 0:04:25 lr 0.000204 wd 0.0500 time 0.5906 (0.5969) data time 0.0009 (0.0036) model time 0.5897 (0.5931) loss 8.2918 (7.1445) grad_norm 1.9935 (2.6234) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:32:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][190/625] eta 0:04:19 lr 0.000204 wd 0.0500 time 0.5911 (0.5967) data time 0.0008 (0.0034) model time 0.5903 (0.5930) loss 6.8537 (7.1310) grad_norm 2.3668 (2.6315) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:32:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][200/625] eta 0:04:13 lr 0.000204 wd 0.0500 time 0.5904 (0.5963) data time 0.0010 (0.0033) model time 0.5894 (0.5927) loss 8.2252 (7.1479) grad_norm 2.7569 (2.6332) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:32:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][210/625] eta 0:04:07 lr 0.000204 wd 0.0500 time 0.5877 (0.5961) data time 0.0008 (0.0032) model time 0.5869 (0.5926) loss 6.4092 (7.1395) grad_norm 2.7969 (2.6416) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:32:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][220/625] eta 0:04:01 lr 0.000204 wd 0.0500 time 0.5854 (0.5958) data time 0.0010 (0.0031) model time 0.5844 (0.5923) loss 6.8118 (7.1522) grad_norm 2.0452 (2.6169) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:33:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][230/625] eta 0:03:55 lr 0.000203 wd 0.0500 time 0.5902 (0.5956) data time 0.0008 (0.0030) model time 0.5894 (0.5922) loss 6.9928 (7.1555) grad_norm 2.3741 (2.6126) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:33:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][240/625] eta 0:03:49 lr 0.000203 wd 0.0500 time 0.5912 (0.5961) data time 0.0010 (0.0029) model time 0.5902 (0.5930) loss 7.0064 (7.1404) grad_norm 1.8982 (2.5884) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:33:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][250/625] eta 0:03:43 lr 0.000203 wd 0.0500 time 0.5823 (0.5960) data time 0.0008 (0.0029) model time 0.5814 (0.5930) loss 7.1825 (7.1224) grad_norm 2.2008 (2.5861) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:33:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][260/625] eta 0:03:37 lr 0.000203 wd 0.0500 time 0.5883 (0.5958) data time 0.0010 (0.0028) model time 0.5873 (0.5928) loss 7.3835 (7.1252) grad_norm 2.4337 (2.5849) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:33:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][270/625] eta 0:03:31 lr 0.000203 wd 0.0500 time 0.5865 (0.5957) data time 0.0008 (0.0027) model time 0.5857 (0.5927) loss 7.0108 (7.1355) grad_norm 2.0586 (2.5628) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:33:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][280/625] eta 0:03:25 lr 0.000203 wd 0.0500 time 0.5880 (0.5955) data time 0.0008 (0.0027) model time 0.5872 (0.5926) loss 6.1326 (7.1344) grad_norm 4.5605 (2.5941) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:33:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][290/625] eta 0:03:19 lr 0.000203 wd 0.0500 time 0.5407 (0.5959) data time 0.0009 (0.0026) model time 0.5398 (0.5932) loss 6.4295 (7.1561) grad_norm 2.1691 (2.5757) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:33:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][300/625] eta 0:03:13 lr 0.000203 wd 0.0500 time 0.5976 (0.5958) data time 0.0008 (0.0026) model time 0.5967 (0.5930) loss 6.7931 (7.1598) grad_norm 2.6871 (2.7052) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:33:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][310/625] eta 0:03:07 lr 0.000203 wd 0.0500 time 0.5954 (0.5957) data time 0.0010 (0.0025) model time 0.5944 (0.5930) loss 7.8153 (7.1715) grad_norm 1.6958 (2.7095) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:33:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][320/625] eta 0:03:01 lr 0.000203 wd 0.0500 time 0.5980 (0.5956) data time 0.0009 (0.0025) model time 0.5972 (0.5930) loss 6.1154 (7.1636) grad_norm 1.9370 (2.6941) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:34:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][330/625] eta 0:02:55 lr 0.000203 wd 0.0500 time 0.5911 (0.5956) data time 0.0008 (0.0024) model time 0.5903 (0.5930) loss 7.3416 (7.1575) grad_norm 2.0681 (2.6863) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:34:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][340/625] eta 0:02:49 lr 0.000203 wd 0.0500 time 0.5824 (0.5955) data time 0.0009 (0.0024) model time 0.5816 (0.5930) loss 7.0746 (7.1674) grad_norm 2.3765 (2.6746) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:34:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][350/625] eta 0:02:43 lr 0.000202 wd 0.0500 time 0.5867 (0.5955) data time 0.0011 (0.0024) model time 0.5856 (0.5930) loss 7.5000 (7.1685) grad_norm 2.2046 (2.6603) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:34:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][360/625] eta 0:02:37 lr 0.000202 wd 0.0500 time 0.5894 (0.5953) data time 0.0009 (0.0023) model time 0.5885 (0.5929) loss 7.7240 (7.1692) grad_norm 1.8740 (2.6394) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:34:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][370/625] eta 0:02:31 lr 0.000202 wd 0.0500 time 0.5893 (0.5952) data time 0.0011 (0.0023) model time 0.5882 (0.5928) loss 6.5265 (7.1631) grad_norm 1.9573 (2.6304) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:34:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][380/625] eta 0:02:25 lr 0.000202 wd 0.0500 time 0.5805 (0.5953) data time 0.0010 (0.0023) model time 0.5795 (0.5929) loss 7.5984 (7.1739) grad_norm 1.8624 (2.6185) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:34:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][390/625] eta 0:02:19 lr 0.000202 wd 0.0500 time 0.5928 (0.5953) data time 0.0009 (0.0023) model time 0.5919 (0.5929) loss 7.8021 (7.1693) grad_norm 2.8021 (2.6064) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:34:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][400/625] eta 0:02:13 lr 0.000202 wd 0.0500 time 0.5760 (0.5953) data time 0.0010 (0.0022) model time 0.5750 (0.5929) loss 6.8081 (7.1634) grad_norm 2.1432 (2.5956) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:34:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][410/625] eta 0:02:07 lr 0.000202 wd 0.0500 time 0.5934 (0.5953) data time 0.0008 (0.0022) model time 0.5926 (0.5929) loss 6.3617 (7.1673) grad_norm 2.0042 (2.5841) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:34:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][420/625] eta 0:02:02 lr 0.000202 wd 0.0500 time 0.5912 (0.5952) data time 0.0011 (0.0022) model time 0.5901 (0.5929) loss 7.1038 (7.1772) grad_norm 1.9485 (2.5779) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:35:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][430/625] eta 0:01:56 lr 0.000202 wd 0.0500 time 0.5894 (0.5951) data time 0.0011 (0.0022) model time 0.5883 (0.5928) loss 6.2393 (7.1771) grad_norm 3.1874 (2.5832) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:35:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][440/625] eta 0:01:50 lr 0.000202 wd 0.0500 time 0.5900 (0.5952) data time 0.0008 (0.0021) model time 0.5892 (0.5930) loss 7.4267 (7.1795) grad_norm 2.4695 (2.5981) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:35:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][450/625] eta 0:01:44 lr 0.000202 wd 0.0500 time 0.5917 (0.5951) data time 0.0007 (0.0021) model time 0.5910 (0.5929) loss 7.5853 (7.1801) grad_norm 2.6943 (2.5970) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:35:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][460/625] eta 0:01:38 lr 0.000202 wd 0.0500 time 0.5930 (0.5955) data time 0.0010 (0.0021) model time 0.5920 (0.5934) loss 6.6033 (7.1727) grad_norm 2.2526 (2.5899) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:35:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][470/625] eta 0:01:32 lr 0.000202 wd 0.0500 time 0.5932 (0.5955) data time 0.0007 (0.0021) model time 0.5925 (0.5934) loss 7.3855 (7.1771) grad_norm 3.2938 (2.5799) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:35:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][480/625] eta 0:01:26 lr 0.000201 wd 0.0500 time 0.5899 (0.5955) data time 0.0007 (0.0020) model time 0.5891 (0.5934) loss 8.0195 (7.1922) grad_norm 2.9367 (2.6124) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:35:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-27 19:35:33 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 19:35:34 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 19:46:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 19:46:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 19:47:11 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 19:47:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 19:47:24 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 19:47:24 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 19:47:24 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 19:47:25 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 226) +[2024-07-27 19:47:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 19:47:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][490/625] eta 0:04:02 lr 0.000201 wd 0.0500 time 0.5994 (1.7972) data time 0.0008 (0.0887) model time 0.5987 (1.7084) loss 7.3875 (7.6914) grad_norm 2.7503 (3.1599) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 19:47:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][500/625] eta 0:02:25 lr 0.000201 wd 0.0500 time 0.5985 (1.1669) data time 0.0010 (0.0426) model time 0.5975 (1.1243) loss 6.4240 (7.3918) grad_norm 2.5819 (2.9602) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 19:47:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][510/625] eta 0:01:51 lr 0.000201 wd 0.0500 time 0.5977 (0.9707) data time 0.0007 (0.0282) model time 0.5970 (0.9425) loss 7.1840 (7.3997) grad_norm 3.7737 (3.1975) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 19:48:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][520/625] eta 0:01:31 lr 0.000201 wd 0.0500 time 0.5916 (0.8750) data time 0.0009 (0.0212) model time 0.5907 (0.8537) loss 6.7186 (7.4047) grad_norm 2.0993 (3.1343) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 19:48:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][530/625] eta 0:01:17 lr 0.000201 wd 0.0500 time 0.6009 (0.8183) data time 0.0010 (0.0171) model time 0.5999 (0.8012) loss 7.5578 (7.3458) grad_norm 2.0764 (2.9402) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 19:48:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][540/625] eta 0:01:07 lr 0.000201 wd 0.0500 time 0.6029 (0.7883) data time 0.0007 (0.0144) model time 0.6021 (0.7739) loss 6.6227 (7.2677) grad_norm 2.5577 (2.8751) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 19:48:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][550/625] eta 0:00:57 lr 0.000201 wd 0.0500 time 0.6027 (0.7614) data time 0.0010 (0.0124) model time 0.6017 (0.7490) loss 8.0628 (7.2494) grad_norm 2.6659 (2.7964) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 19:48:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][560/625] eta 0:00:48 lr 0.000201 wd 0.0500 time 0.6054 (0.7415) data time 0.0009 (0.0110) model time 0.6044 (0.7305) loss 6.9287 (7.2491) grad_norm 3.3618 (2.8332) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 19:48:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-27 19:48:28 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 19:48:31 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 19:50:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 19:50:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 19:50:48 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 19:50:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 19:50:57 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 19:50:58 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 19:50:58 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 19:50:58 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 226) +[2024-07-27 19:50:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 19:51:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][570/625] eta 0:01:40 lr 0.000201 wd 0.0500 time 0.5734 (1.8323) data time 0.0009 (0.0482) model time 0.5725 (1.7841) loss 8.2017 (7.6634) grad_norm 1.7774 (2.4990) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 19:51:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][580/625] eta 0:00:54 lr 0.000201 wd 0.0500 time 0.5684 (1.2034) data time 0.0007 (0.0245) model time 0.5677 (1.1789) loss 7.4778 (7.5343) grad_norm 1.8146 (2.6393) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 19:51:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][590/625] eta 0:00:34 lr 0.000201 wd 0.0500 time 0.5755 (0.9927) data time 0.0009 (0.0167) model time 0.5746 (0.9761) loss 8.4823 (7.6062) grad_norm 1.8240 (2.4808) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 19:51:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][600/625] eta 0:00:22 lr 0.000201 wd 0.0500 time 0.6017 (0.8880) data time 0.0007 (0.0127) model time 0.6010 (0.8753) loss 6.4005 (7.4034) grad_norm 1.6877 (2.4909) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 19:51:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][610/625] eta 0:00:12 lr 0.000200 wd 0.0500 time 0.5704 (0.8246) data time 0.0006 (0.0105) model time 0.5698 (0.8141) loss 6.2855 (7.3527) grad_norm 1.7312 (2.4455) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 19:51:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [226/300][620/625] eta 0:00:03 lr 0.000200 wd 0.0500 time 0.5726 (0.7885) data time 0.0004 (0.0089) model time 0.5721 (0.7796) loss 7.1368 (7.3158) grad_norm 4.2376 (2.4644) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 19:51:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 226 training takes 0:00:49 +[2024-07-27 19:51:52 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 19:51:59 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 19:51:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.480 (0.480) Loss 0.5039 (0.5039) Acc@1 90.332 (90.332) Acc@5 98.975 (98.975) Mem 22341MB +[2024-07-27 19:52:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7617 (0.6159) Acc@1 82.812 (87.740) Acc@5 97.119 (98.105) Mem 22341MB +[2024-07-27 19:52:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.124 (0.141) Loss 0.8584 (0.7061) Acc@1 79.639 (84.996) Acc@5 96.045 (97.245) Mem 22341MB +[2024-07-27 19:52:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.605 Acc@5 97.247 +[2024-07-27 19:52:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.6% +[2024-07-27 19:52:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.816 (0.816) Loss 0.5063 (0.5063) Acc@1 90.479 (90.479) Acc@5 98.926 (98.926) Mem 22341MB +[2024-07-27 19:52:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.124 (0.187) Loss 0.7466 (0.6171) Acc@1 83.301 (87.749) Acc@5 96.826 (98.069) Mem 22341MB +[2024-07-27 19:52:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.124 (0.157) Loss 0.8564 (0.7051) Acc@1 79.492 (84.905) Acc@5 96.143 (97.259) Mem 22341MB +[2024-07-27 19:52:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.533 Acc@5 97.245 +[2024-07-27 19:52:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.5% +[2024-07-27 19:52:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.53% +[2024-07-27 19:52:09 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-27 19:52:14 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-27 19:52:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][0/625] eta 0:13:09 lr 0.000200 wd 0.0500 time 1.2637 (1.2637) data time 0.3915 (0.3915) model time 0.0000 (0.0000) loss 7.4695 (7.4695) grad_norm 2.2447 (2.2447) loss_scale 256.0000 (256.0000) mem 22337MB +[2024-07-27 19:52:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][10/625] eta 0:06:29 lr 0.000200 wd 0.0500 time 0.5740 (0.6340) data time 0.0008 (0.0364) model time 0.0000 (0.0000) loss 6.5718 (6.8663) grad_norm 2.1492 (2.1131) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:52:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][20/625] eta 0:06:05 lr 0.000200 wd 0.0500 time 0.5735 (0.6043) data time 0.0007 (0.0195) model time 0.0000 (0.0000) loss 7.3862 (7.0096) grad_norm 4.2664 (2.4980) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:52:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][30/625] eta 0:05:53 lr 0.000200 wd 0.0500 time 0.5705 (0.5934) data time 0.0008 (0.0135) model time 0.0000 (0.0000) loss 7.8949 (7.0702) grad_norm 1.5825 (2.5124) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:52:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][40/625] eta 0:05:44 lr 0.000200 wd 0.0500 time 0.5717 (0.5880) data time 0.0008 (0.0104) model time 0.0000 (0.0000) loss 7.6775 (7.2253) grad_norm 2.4600 (2.4872) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:52:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][50/625] eta 0:05:36 lr 0.000200 wd 0.0500 time 0.5718 (0.5847) data time 0.0006 (0.0085) model time 0.0000 (0.0000) loss 6.1361 (7.1962) grad_norm 1.9959 (2.3979) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:52:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][60/625] eta 0:05:29 lr 0.000200 wd 0.0500 time 0.5749 (0.5827) data time 0.0006 (0.0073) model time 0.5743 (0.5720) loss 6.9859 (7.2429) grad_norm 2.4281 (2.4016) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:52:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][70/625] eta 0:05:22 lr 0.000200 wd 0.0500 time 0.5757 (0.5815) data time 0.0006 (0.0064) model time 0.5751 (0.5728) loss 6.8621 (7.2338) grad_norm 2.3082 (2.4032) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:53:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][80/625] eta 0:05:16 lr 0.000200 wd 0.0500 time 0.5764 (0.5807) data time 0.0008 (0.0057) model time 0.5755 (0.5731) loss 7.9576 (7.2075) grad_norm 2.2103 (2.4094) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:53:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][90/625] eta 0:05:10 lr 0.000200 wd 0.0500 time 0.5736 (0.5800) data time 0.0008 (0.0051) model time 0.5728 (0.5731) loss 7.9470 (7.1989) grad_norm 2.4111 (2.4314) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:53:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][100/625] eta 0:05:04 lr 0.000200 wd 0.0500 time 0.5744 (0.5793) data time 0.0008 (0.0047) model time 0.5735 (0.5731) loss 8.5533 (7.2181) grad_norm 3.8036 (2.4253) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:53:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][110/625] eta 0:04:58 lr 0.000199 wd 0.0500 time 0.5737 (0.5788) data time 0.0007 (0.0044) model time 0.5730 (0.5730) loss 6.5404 (7.2003) grad_norm 2.0874 (2.4099) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:53:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][120/625] eta 0:04:52 lr 0.000199 wd 0.0500 time 0.5724 (0.5784) data time 0.0009 (0.0041) model time 0.5715 (0.5730) loss 7.3789 (7.2021) grad_norm 1.9258 (2.3895) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:53:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][130/625] eta 0:04:46 lr 0.000199 wd 0.0500 time 0.5769 (0.5781) data time 0.0009 (0.0038) model time 0.5760 (0.5730) loss 6.4895 (7.1846) grad_norm 2.4083 (2.3686) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:53:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][140/625] eta 0:04:40 lr 0.000199 wd 0.0500 time 0.5750 (0.5779) data time 0.0009 (0.0036) model time 0.5741 (0.5732) loss 6.4852 (7.1540) grad_norm 2.0170 (2.3614) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:53:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][150/625] eta 0:04:34 lr 0.000199 wd 0.0500 time 0.5776 (0.5787) data time 0.0008 (0.0034) model time 0.5767 (0.5749) loss 6.5112 (7.1468) grad_norm 3.2974 (2.3490) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:53:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][160/625] eta 0:04:29 lr 0.000199 wd 0.0500 time 0.5758 (0.5786) data time 0.0006 (0.0033) model time 0.5751 (0.5749) loss 6.6803 (7.1338) grad_norm 2.3420 (2.3761) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:53:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][170/625] eta 0:04:23 lr 0.000199 wd 0.0500 time 0.5750 (0.5784) data time 0.0009 (0.0031) model time 0.5742 (0.5748) loss 8.3811 (7.1312) grad_norm 2.5939 (2.3876) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:53:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][180/625] eta 0:04:17 lr 0.000199 wd 0.0500 time 0.5748 (0.5782) data time 0.0008 (0.0030) model time 0.5740 (0.5748) loss 7.4948 (7.1406) grad_norm 2.6351 (2.3828) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:54:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][190/625] eta 0:04:11 lr 0.000199 wd 0.0500 time 0.5760 (0.5780) data time 0.0007 (0.0029) model time 0.5753 (0.5747) loss 6.4808 (7.1161) grad_norm 3.3741 (2.3898) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:54:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][200/625] eta 0:04:05 lr 0.000199 wd 0.0500 time 0.5761 (0.5778) data time 0.0007 (0.0028) model time 0.5754 (0.5746) loss 6.2077 (7.0966) grad_norm 2.4220 (2.3936) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:54:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][210/625] eta 0:04:00 lr 0.000199 wd 0.0500 time 0.5765 (0.5787) data time 0.0009 (0.0027) model time 0.5756 (0.5759) loss 7.0435 (7.1039) grad_norm 3.6266 (2.4148) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:54:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][220/625] eta 0:03:54 lr 0.000199 wd 0.0500 time 0.5740 (0.5785) data time 0.0008 (0.0026) model time 0.5732 (0.5758) loss 8.1803 (7.1062) grad_norm 1.8800 (2.4070) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:54:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][230/625] eta 0:03:48 lr 0.000199 wd 0.0500 time 0.5757 (0.5784) data time 0.0008 (0.0026) model time 0.5749 (0.5758) loss 7.1552 (7.0954) grad_norm 2.1007 (2.4194) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:54:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][240/625] eta 0:03:42 lr 0.000198 wd 0.0500 time 0.5766 (0.5783) data time 0.0009 (0.0025) model time 0.5757 (0.5758) loss 6.7102 (7.0903) grad_norm 2.7918 (2.4353) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:54:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][250/625] eta 0:03:36 lr 0.000198 wd 0.0500 time 0.5760 (0.5782) data time 0.0008 (0.0024) model time 0.5752 (0.5757) loss 7.8534 (7.1147) grad_norm 2.1876 (2.4365) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:54:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][260/625] eta 0:03:31 lr 0.000198 wd 0.0500 time 0.5776 (0.5781) data time 0.0006 (0.0024) model time 0.5770 (0.5757) loss 6.6760 (7.1344) grad_norm 2.5990 (2.4768) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:54:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][270/625] eta 0:03:25 lr 0.000198 wd 0.0500 time 0.5760 (0.5780) data time 0.0007 (0.0023) model time 0.5754 (0.5756) loss 6.8964 (7.1331) grad_norm 2.1098 (2.4746) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:54:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][280/625] eta 0:03:19 lr 0.000198 wd 0.0500 time 0.5812 (0.5779) data time 0.0007 (0.0022) model time 0.5805 (0.5755) loss 7.0075 (7.1455) grad_norm 3.8328 (2.4790) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:55:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][290/625] eta 0:03:13 lr 0.000198 wd 0.0500 time 0.5830 (0.5778) data time 0.0009 (0.0022) model time 0.5822 (0.5755) loss 7.3692 (7.1554) grad_norm 3.6568 (2.4756) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:55:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][300/625] eta 0:03:07 lr 0.000198 wd 0.0500 time 0.5754 (0.5778) data time 0.0008 (0.0022) model time 0.5747 (0.5755) loss 7.5542 (7.1491) grad_norm 2.0322 (2.4956) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:55:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][310/625] eta 0:03:01 lr 0.000198 wd 0.0500 time 0.5738 (0.5777) data time 0.0007 (0.0021) model time 0.5731 (0.5754) loss 5.9183 (7.1498) grad_norm 1.7898 (2.5009) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:55:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][320/625] eta 0:02:56 lr 0.000198 wd 0.0500 time 0.5732 (0.5776) data time 0.0006 (0.0021) model time 0.5726 (0.5754) loss 5.6004 (7.1312) grad_norm 1.8446 (2.5142) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:55:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][330/625] eta 0:02:50 lr 0.000198 wd 0.0500 time 0.5841 (0.5776) data time 0.0008 (0.0020) model time 0.5832 (0.5754) loss 7.4244 (7.1401) grad_norm 1.7076 (2.5160) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:55:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][340/625] eta 0:02:44 lr 0.000198 wd 0.0500 time 0.5750 (0.5775) data time 0.0006 (0.0020) model time 0.5744 (0.5754) loss 6.8992 (7.1470) grad_norm 2.4880 (2.5331) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:55:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][350/625] eta 0:02:38 lr 0.000198 wd 0.0500 time 0.5777 (0.5774) data time 0.0009 (0.0020) model time 0.5769 (0.5753) loss 7.5726 (7.1512) grad_norm 2.1415 (2.5280) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:55:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][360/625] eta 0:02:32 lr 0.000198 wd 0.0500 time 0.5747 (0.5774) data time 0.0007 (0.0019) model time 0.5740 (0.5753) loss 7.2061 (7.1460) grad_norm 3.0575 (2.5252) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:55:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][370/625] eta 0:02:27 lr 0.000197 wd 0.0500 time 0.5729 (0.5776) data time 0.0006 (0.0019) model time 0.5723 (0.5756) loss 6.3888 (7.1566) grad_norm 1.9132 (2.5431) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:55:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][380/625] eta 0:02:21 lr 0.000197 wd 0.0500 time 0.5743 (0.5775) data time 0.0009 (0.0019) model time 0.5735 (0.5756) loss 8.2918 (7.1537) grad_norm 2.8521 (2.5376) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:56:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][390/625] eta 0:02:15 lr 0.000197 wd 0.0500 time 0.5732 (0.5774) data time 0.0008 (0.0019) model time 0.5724 (0.5755) loss 7.1768 (7.1534) grad_norm 3.2586 (2.5333) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:56:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][400/625] eta 0:02:09 lr 0.000197 wd 0.0500 time 0.5782 (0.5774) data time 0.0006 (0.0018) model time 0.5775 (0.5755) loss 5.8407 (7.1452) grad_norm 2.9284 (2.5309) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:56:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][410/625] eta 0:02:04 lr 0.000197 wd 0.0500 time 0.5719 (0.5773) data time 0.0007 (0.0018) model time 0.5711 (0.5754) loss 6.6149 (7.1367) grad_norm 2.7209 (2.5323) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:56:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][420/625] eta 0:01:58 lr 0.000197 wd 0.0500 time 0.5733 (0.5772) data time 0.0007 (0.0018) model time 0.5727 (0.5753) loss 7.9717 (7.1359) grad_norm 2.0150 (2.5261) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:56:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][430/625] eta 0:01:52 lr 0.000197 wd 0.0500 time 0.5771 (0.5779) data time 0.0006 (0.0018) model time 0.5765 (0.5762) loss 6.8992 (7.1354) grad_norm 2.5083 (2.5651) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:56:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][440/625] eta 0:01:46 lr 0.000197 wd 0.0500 time 0.5781 (0.5779) data time 0.0010 (0.0017) model time 0.5771 (0.5762) loss 7.6605 (7.1340) grad_norm 2.3709 (2.5803) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:56:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][450/625] eta 0:01:41 lr 0.000197 wd 0.0500 time 0.5736 (0.5779) data time 0.0008 (0.0017) model time 0.5728 (0.5761) loss 6.2366 (7.1432) grad_norm 3.4272 (2.6239) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:56:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][460/625] eta 0:01:35 lr 0.000197 wd 0.0500 time 0.5763 (0.5778) data time 0.0007 (0.0017) model time 0.5756 (0.5761) loss 7.9784 (7.1403) grad_norm 3.0839 (2.6370) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:56:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][470/625] eta 0:01:29 lr 0.000197 wd 0.0500 time 0.5717 (0.5777) data time 0.0006 (0.0017) model time 0.5711 (0.5760) loss 5.8654 (7.1372) grad_norm 4.9757 (2.6402) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:56:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][480/625] eta 0:01:23 lr 0.000197 wd 0.0500 time 0.5728 (0.5776) data time 0.0007 (0.0017) model time 0.5721 (0.5759) loss 6.6676 (7.1409) grad_norm 1.5852 (2.6308) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:56:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][490/625] eta 0:01:17 lr 0.000197 wd 0.0500 time 0.5741 (0.5776) data time 0.0009 (0.0017) model time 0.5732 (0.5759) loss 7.2634 (7.1406) grad_norm 1.9852 (2.6258) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:57:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][500/625] eta 0:01:12 lr 0.000196 wd 0.0500 time 0.5895 (0.5776) data time 0.0009 (0.0016) model time 0.5887 (0.5759) loss 7.3536 (7.1487) grad_norm 3.2771 (2.6480) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:57:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][510/625] eta 0:01:06 lr 0.000196 wd 0.0500 time 0.5767 (0.5775) data time 0.0008 (0.0016) model time 0.5759 (0.5759) loss 7.2854 (7.1481) grad_norm 2.6249 (2.6400) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:57:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][520/625] eta 0:01:00 lr 0.000196 wd 0.0500 time 0.5775 (0.5775) data time 0.0008 (0.0016) model time 0.5766 (0.5759) loss 7.4179 (7.1521) grad_norm 8.0031 (2.6511) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:57:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][530/625] eta 0:00:54 lr 0.000196 wd 0.0500 time 0.5770 (0.5775) data time 0.0007 (0.0016) model time 0.5764 (0.5759) loss 7.4450 (7.1590) grad_norm 3.1642 (2.6627) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:57:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][540/625] eta 0:00:49 lr 0.000196 wd 0.0500 time 0.5742 (0.5774) data time 0.0007 (0.0016) model time 0.5735 (0.5758) loss 8.0345 (7.1564) grad_norm 3.1996 (2.6657) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:57:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][550/625] eta 0:00:43 lr 0.000196 wd 0.0500 time 0.5725 (0.5773) data time 0.0008 (0.0016) model time 0.5717 (0.5757) loss 7.1129 (7.1521) grad_norm 4.0513 (2.6631) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:57:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][560/625] eta 0:00:37 lr 0.000196 wd 0.0500 time 0.5710 (0.5773) data time 0.0008 (0.0016) model time 0.5702 (0.5757) loss 7.8739 (7.1576) grad_norm 2.0977 (2.6580) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:57:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][570/625] eta 0:00:31 lr 0.000196 wd 0.0500 time 0.5771 (0.5773) data time 0.0009 (0.0015) model time 0.5762 (0.5757) loss 6.5645 (7.1632) grad_norm 2.6270 (2.6741) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:57:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][580/625] eta 0:00:25 lr 0.000196 wd 0.0500 time 0.6141 (0.5774) data time 0.0006 (0.0015) model time 0.6134 (0.5759) loss 6.0907 (7.1577) grad_norm 3.3348 (2.6719) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:57:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][590/625] eta 0:00:20 lr 0.000196 wd 0.0500 time 0.5754 (0.5777) data time 0.0008 (0.0015) model time 0.5747 (0.5762) loss 6.5760 (7.1587) grad_norm 55.6848 (2.7590) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:58:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][600/625] eta 0:00:14 lr 0.000196 wd 0.0500 time 0.5776 (0.5777) data time 0.0007 (0.0015) model time 0.5769 (0.5761) loss 6.7381 (7.1542) grad_norm 3.0551 (2.7629) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:58:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][610/625] eta 0:00:08 lr 0.000196 wd 0.0500 time 0.5729 (0.5776) data time 0.0004 (0.0015) model time 0.5725 (0.5761) loss 7.5858 (7.1613) grad_norm 1.8781 (2.7566) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:58:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [227/300][620/625] eta 0:00:02 lr 0.000196 wd 0.0500 time 0.5684 (0.5776) data time 0.0006 (0.0015) model time 0.5677 (0.5761) loss 7.8161 (7.1583) grad_norm 2.9677 (2.7593) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:58:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 227 training takes 0:06:00 +[2024-07-27 19:58:15 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 19:58:17 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 19:58:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.488 (0.488) Loss 0.5054 (0.5054) Acc@1 90.039 (90.039) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-27 19:58:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.159) Loss 0.7656 (0.6196) Acc@1 82.080 (87.695) Acc@5 96.924 (98.082) Mem 22339MB +[2024-07-27 19:58:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.144) Loss 0.8491 (0.7075) Acc@1 80.176 (84.956) Acc@5 96.240 (97.282) Mem 22339MB +[2024-07-27 19:58:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.499 Acc@5 97.259 +[2024-07-27 19:58:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.5% +[2024-07-27 19:58:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.824 (0.824) Loss 0.5063 (0.5063) Acc@1 90.479 (90.479) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-27 19:58:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.189) Loss 0.7471 (0.6170) Acc@1 83.301 (87.749) Acc@5 96.826 (98.069) Mem 22339MB +[2024-07-27 19:58:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.159) Loss 0.8545 (0.7047) Acc@1 79.492 (84.903) Acc@5 96.094 (97.256) Mem 22339MB +[2024-07-27 19:58:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.519 Acc@5 97.243 +[2024-07-27 19:58:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.5% +[2024-07-27 19:58:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][0/625] eta 0:13:48 lr 0.000196 wd 0.0500 time 1.3261 (1.3261) data time 0.5532 (0.5532) model time 0.0000 (0.0000) loss 6.4081 (6.4081) grad_norm 2.1374 (2.1374) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:58:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][10/625] eta 0:06:35 lr 0.000195 wd 0.0500 time 0.5760 (0.6433) data time 0.0007 (0.0511) model time 0.0000 (0.0000) loss 5.9678 (6.8088) grad_norm 3.3114 (2.5395) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:58:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][20/625] eta 0:06:09 lr 0.000195 wd 0.0500 time 0.5754 (0.6107) data time 0.0010 (0.0271) model time 0.0000 (0.0000) loss 8.0805 (6.9565) grad_norm 1.5609 (2.6104) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:58:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][30/625] eta 0:06:02 lr 0.000195 wd 0.0500 time 0.5802 (0.6100) data time 0.0009 (0.0186) model time 0.0000 (0.0000) loss 6.2777 (6.9549) grad_norm 2.7653 (2.6094) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:58:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][40/625] eta 0:05:51 lr 0.000195 wd 0.0500 time 0.5798 (0.6016) data time 0.0008 (0.0143) model time 0.0000 (0.0000) loss 8.8010 (7.0261) grad_norm 1.7828 (2.6574) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:58:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][50/625] eta 0:05:43 lr 0.000195 wd 0.0500 time 0.5769 (0.5969) data time 0.0009 (0.0120) model time 0.0000 (0.0000) loss 8.1442 (7.1346) grad_norm 2.3893 (2.5571) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:59:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][60/625] eta 0:05:35 lr 0.000195 wd 0.0500 time 0.5757 (0.5932) data time 0.0009 (0.0101) model time 0.5749 (0.5733) loss 6.4898 (7.1180) grad_norm 1.8983 (2.5151) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:59:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][70/625] eta 0:05:27 lr 0.000195 wd 0.0500 time 0.5739 (0.5905) data time 0.0009 (0.0088) model time 0.5729 (0.5733) loss 8.5620 (7.1442) grad_norm 1.6894 (2.4561) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:59:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][80/625] eta 0:05:20 lr 0.000195 wd 0.0500 time 0.5725 (0.5885) data time 0.0008 (0.0078) model time 0.5717 (0.5733) loss 8.7672 (7.1858) grad_norm 2.2216 (2.4385) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:59:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][90/625] eta 0:05:14 lr 0.000195 wd 0.0500 time 0.5767 (0.5871) data time 0.0009 (0.0071) model time 0.5758 (0.5735) loss 8.1844 (7.1779) grad_norm 2.6320 (2.4348) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:59:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][100/625] eta 0:05:07 lr 0.000195 wd 0.0500 time 0.5746 (0.5860) data time 0.0010 (0.0065) model time 0.5736 (0.5740) loss 7.5110 (7.1531) grad_norm 1.9140 (2.4252) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:59:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][110/625] eta 0:05:01 lr 0.000195 wd 0.0500 time 0.5790 (0.5850) data time 0.0009 (0.0060) model time 0.5782 (0.5740) loss 7.4719 (7.1338) grad_norm 3.6413 (2.4356) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:59:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][120/625] eta 0:04:55 lr 0.000195 wd 0.0500 time 0.5738 (0.5842) data time 0.0006 (0.0056) model time 0.5732 (0.5741) loss 6.0004 (7.0981) grad_norm 1.8176 (2.4171) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:59:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][130/625] eta 0:04:48 lr 0.000195 wd 0.0500 time 0.5751 (0.5835) data time 0.0007 (0.0052) model time 0.5745 (0.5740) loss 6.0161 (7.1175) grad_norm 2.2126 (2.4139) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:59:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][140/625] eta 0:04:42 lr 0.000194 wd 0.0500 time 0.5735 (0.5828) data time 0.0010 (0.0049) model time 0.5725 (0.5739) loss 8.2872 (7.1195) grad_norm 2.6342 (2.4185) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:59:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][150/625] eta 0:04:36 lr 0.000194 wd 0.0500 time 0.5786 (0.5830) data time 0.0007 (0.0046) model time 0.5779 (0.5749) loss 6.2125 (7.0995) grad_norm 3.1182 (2.4332) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 19:59:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][160/625] eta 0:04:30 lr 0.000194 wd 0.0500 time 0.5775 (0.5825) data time 0.0006 (0.0044) model time 0.5769 (0.5749) loss 7.3172 (7.0823) grad_norm 2.8006 (2.4614) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 20:00:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][170/625] eta 0:04:24 lr 0.000194 wd 0.0500 time 0.5791 (0.5820) data time 0.0007 (0.0042) model time 0.5784 (0.5748) loss 6.5277 (7.0984) grad_norm 2.1551 (2.4565) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 20:00:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][180/625] eta 0:04:18 lr 0.000194 wd 0.0500 time 0.5767 (0.5817) data time 0.0006 (0.0040) model time 0.5761 (0.5748) loss 7.4049 (7.1231) grad_norm 2.4472 (2.5047) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 20:00:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][190/625] eta 0:04:12 lr 0.000194 wd 0.0500 time 0.5739 (0.5813) data time 0.0009 (0.0038) model time 0.5731 (0.5747) loss 6.2308 (7.1000) grad_norm 2.7375 (2.5051) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 20:00:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][200/625] eta 0:04:06 lr 0.000194 wd 0.0500 time 0.5739 (0.5810) data time 0.0009 (0.0037) model time 0.5730 (0.5746) loss 5.8163 (7.1064) grad_norm 1.9251 (2.4985) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 20:00:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][210/625] eta 0:04:00 lr 0.000194 wd 0.0500 time 0.5771 (0.5807) data time 0.0007 (0.0036) model time 0.5764 (0.5746) loss 8.0257 (7.1011) grad_norm 3.1532 (2.5190) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 20:00:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][220/625] eta 0:03:55 lr 0.000194 wd 0.0500 time 0.5770 (0.5804) data time 0.0007 (0.0034) model time 0.5763 (0.5745) loss 7.6193 (7.1172) grad_norm 2.8765 (2.5457) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 20:00:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][230/625] eta 0:03:49 lr 0.000194 wd 0.0500 time 0.5748 (0.5801) data time 0.0009 (0.0033) model time 0.5739 (0.5745) loss 7.4041 (7.1084) grad_norm 6.4620 (2.5797) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 20:00:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][240/625] eta 0:03:43 lr 0.000194 wd 0.0500 time 0.7801 (0.5808) data time 0.0009 (0.0032) model time 0.7792 (0.5756) loss 6.6548 (7.1187) grad_norm 4.1347 (2.5984) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 20:00:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][250/625] eta 0:03:38 lr 0.000194 wd 0.0500 time 0.5775 (0.5814) data time 0.0009 (0.0031) model time 0.5766 (0.5766) loss 7.8854 (7.1209) grad_norm 1.5965 (2.5844) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 20:00:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][260/625] eta 0:03:32 lr 0.000194 wd 0.0500 time 0.5773 (0.5812) data time 0.0007 (0.0030) model time 0.5766 (0.5765) loss 7.5084 (7.1024) grad_norm 5.9313 (2.6051) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 20:01:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][270/625] eta 0:03:26 lr 0.000193 wd 0.0500 time 0.5792 (0.5810) data time 0.0006 (0.0030) model time 0.5785 (0.5764) loss 7.6336 (7.0967) grad_norm 2.6294 (2.5995) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 20:01:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][280/625] eta 0:03:20 lr 0.000193 wd 0.0500 time 0.5825 (0.5807) data time 0.0009 (0.0029) model time 0.5817 (0.5762) loss 7.8648 (7.1130) grad_norm 6.4809 (2.6054) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 20:01:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][290/625] eta 0:03:14 lr 0.000193 wd 0.0500 time 0.5735 (0.5805) data time 0.0006 (0.0029) model time 0.5729 (0.5761) loss 7.2752 (7.1147) grad_norm 2.0957 (2.5974) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 20:01:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][300/625] eta 0:03:08 lr 0.000193 wd 0.0500 time 0.5753 (0.5803) data time 0.0009 (0.0028) model time 0.5744 (0.5760) loss 7.6103 (7.1180) grad_norm 2.4383 (2.5811) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 20:01:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][310/625] eta 0:03:02 lr 0.000193 wd 0.0500 time 0.5736 (0.5801) data time 0.0006 (0.0027) model time 0.5729 (0.5759) loss 6.9360 (7.1198) grad_norm 1.9163 (2.6462) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 20:01:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][320/625] eta 0:02:56 lr 0.000193 wd 0.0500 time 0.5797 (0.5801) data time 0.0009 (0.0027) model time 0.5788 (0.5759) loss 8.0370 (7.1225) grad_norm 2.8310 (2.6379) loss_scale 512.0000 (262.3801) mem 22339MB +[2024-07-27 20:01:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][330/625] eta 0:02:51 lr 0.000193 wd 0.0500 time 0.5756 (0.5799) data time 0.0007 (0.0026) model time 0.5750 (0.5759) loss 7.7370 (7.1310) grad_norm 2.1353 (2.6321) loss_scale 512.0000 (269.9215) mem 22339MB +[2024-07-27 20:01:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][340/625] eta 0:02:45 lr 0.000193 wd 0.0500 time 0.5771 (0.5798) data time 0.0007 (0.0026) model time 0.5764 (0.5759) loss 6.4753 (7.1252) grad_norm 1.6911 (2.6586) loss_scale 512.0000 (277.0205) mem 22339MB +[2024-07-27 20:01:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][350/625] eta 0:02:39 lr 0.000193 wd 0.0500 time 0.5742 (0.5796) data time 0.0006 (0.0025) model time 0.5736 (0.5758) loss 6.1454 (7.1319) grad_norm 2.7860 (2.6707) loss_scale 512.0000 (283.7151) mem 22339MB +[2024-07-27 20:01:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][360/625] eta 0:02:33 lr 0.000193 wd 0.0500 time 0.5760 (0.5795) data time 0.0009 (0.0025) model time 0.5750 (0.5757) loss 8.4418 (7.1423) grad_norm 4.1013 (2.6831) loss_scale 512.0000 (290.0388) mem 22339MB +[2024-07-27 20:01:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][370/625] eta 0:02:27 lr 0.000193 wd 0.0500 time 0.5727 (0.5798) data time 0.0009 (0.0024) model time 0.5718 (0.5761) loss 7.0071 (7.1485) grad_norm 1.8289 (2.6801) loss_scale 512.0000 (296.0216) mem 22339MB +[2024-07-27 20:02:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][380/625] eta 0:02:22 lr 0.000193 wd 0.0500 time 0.5751 (0.5797) data time 0.0007 (0.0024) model time 0.5744 (0.5761) loss 6.9143 (7.1483) grad_norm 3.2642 (2.6706) loss_scale 512.0000 (301.6903) mem 22339MB +[2024-07-27 20:02:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][390/625] eta 0:02:16 lr 0.000193 wd 0.0500 time 0.5773 (0.5796) data time 0.0007 (0.0023) model time 0.5767 (0.5761) loss 8.0415 (7.1495) grad_norm 2.0615 (2.6608) loss_scale 512.0000 (307.0691) mem 22339MB +[2024-07-27 20:02:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][400/625] eta 0:02:10 lr 0.000192 wd 0.0500 time 0.5761 (0.5795) data time 0.0007 (0.0023) model time 0.5755 (0.5761) loss 8.5316 (7.1492) grad_norm 2.1481 (2.6778) loss_scale 512.0000 (312.1796) mem 22339MB +[2024-07-27 20:02:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][410/625] eta 0:02:04 lr 0.000192 wd 0.0500 time 0.5842 (0.5794) data time 0.0009 (0.0023) model time 0.5833 (0.5760) loss 6.9533 (7.1404) grad_norm 4.2932 (2.6896) loss_scale 512.0000 (317.0414) mem 22339MB +[2024-07-27 20:02:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][420/625] eta 0:01:58 lr 0.000192 wd 0.0500 time 0.5739 (0.5793) data time 0.0007 (0.0022) model time 0.5732 (0.5759) loss 7.3416 (7.1473) grad_norm 2.6991 (2.6881) loss_scale 512.0000 (321.6722) mem 22339MB +[2024-07-27 20:02:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][430/625] eta 0:01:52 lr 0.000192 wd 0.0500 time 0.5877 (0.5792) data time 0.0009 (0.0022) model time 0.5868 (0.5759) loss 6.4868 (7.1573) grad_norm 2.1522 (2.6822) loss_scale 512.0000 (326.0882) mem 22339MB +[2024-07-27 20:02:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][440/625] eta 0:01:47 lr 0.000192 wd 0.0500 time 0.5790 (0.5791) data time 0.0009 (0.0022) model time 0.5781 (0.5759) loss 8.8838 (7.1616) grad_norm 1.9931 (2.6812) loss_scale 512.0000 (330.3039) mem 22339MB +[2024-07-27 20:02:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][450/625] eta 0:01:41 lr 0.000192 wd 0.0500 time 0.5806 (0.5790) data time 0.0010 (0.0021) model time 0.5796 (0.5758) loss 7.6387 (7.1613) grad_norm 2.8100 (2.6834) loss_scale 512.0000 (334.3326) mem 22339MB +[2024-07-27 20:02:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][460/625] eta 0:01:35 lr 0.000192 wd 0.0500 time 0.5780 (0.5790) data time 0.0008 (0.0021) model time 0.5772 (0.5758) loss 6.8999 (7.1616) grad_norm 2.0980 (2.6891) loss_scale 512.0000 (338.1866) mem 22339MB +[2024-07-27 20:02:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][470/625] eta 0:01:29 lr 0.000192 wd 0.0500 time 0.5743 (0.5801) data time 0.0006 (0.0021) model time 0.5736 (0.5771) loss 6.3145 (7.1596) grad_norm 2.3764 (2.6892) loss_scale 512.0000 (341.8769) mem 22339MB +[2024-07-27 20:03:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][480/625] eta 0:01:24 lr 0.000192 wd 0.0500 time 0.5742 (0.5800) data time 0.0010 (0.0021) model time 0.5732 (0.5771) loss 7.9328 (7.1625) grad_norm 3.7386 (2.6916) loss_scale 512.0000 (345.4137) mem 22339MB +[2024-07-27 20:03:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][490/625] eta 0:01:18 lr 0.000192 wd 0.0500 time 0.5739 (0.5799) data time 0.0009 (0.0020) model time 0.5730 (0.5770) loss 7.6465 (7.1692) grad_norm 2.8773 (2.6961) loss_scale 512.0000 (348.8065) mem 22339MB +[2024-07-27 20:03:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][500/625] eta 0:01:12 lr 0.000192 wd 0.0500 time 0.5731 (0.5798) data time 0.0006 (0.0020) model time 0.5725 (0.5769) loss 7.0811 (7.1743) grad_norm 1.9225 (2.6919) loss_scale 512.0000 (352.0639) mem 22339MB +[2024-07-27 20:03:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][510/625] eta 0:01:06 lr 0.000192 wd 0.0500 time 0.5702 (0.5796) data time 0.0006 (0.0020) model time 0.5696 (0.5768) loss 7.8976 (7.1798) grad_norm 1.9067 (2.6921) loss_scale 512.0000 (355.1937) mem 22339MB +[2024-07-27 20:03:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][520/625] eta 0:01:00 lr 0.000192 wd 0.0500 time 0.5743 (0.5796) data time 0.0009 (0.0020) model time 0.5734 (0.5768) loss 7.3323 (7.1829) grad_norm 2.6556 (2.6847) loss_scale 512.0000 (358.2035) mem 22339MB +[2024-07-27 20:03:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][530/625] eta 0:00:55 lr 0.000191 wd 0.0500 time 0.5806 (0.5796) data time 0.0009 (0.0020) model time 0.5797 (0.5768) loss 7.9198 (7.1841) grad_norm 2.1610 (2.6775) loss_scale 512.0000 (361.0998) mem 22339MB +[2024-07-27 20:03:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][540/625] eta 0:00:49 lr 0.000191 wd 0.0500 time 0.5793 (0.5795) data time 0.0009 (0.0019) model time 0.5784 (0.5768) loss 7.6315 (7.1806) grad_norm 3.2717 (2.6734) loss_scale 512.0000 (363.8891) mem 22339MB +[2024-07-27 20:03:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][550/625] eta 0:00:43 lr 0.000191 wd 0.0500 time 0.5748 (0.5794) data time 0.0007 (0.0019) model time 0.5741 (0.5767) loss 6.4966 (7.1826) grad_norm 2.1271 (2.6649) loss_scale 512.0000 (366.5771) mem 22339MB +[2024-07-27 20:03:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][560/625] eta 0:00:37 lr 0.000191 wd 0.0500 time 0.5722 (0.5794) data time 0.0009 (0.0019) model time 0.5713 (0.5767) loss 6.3341 (7.1811) grad_norm 3.2129 (2.6558) loss_scale 512.0000 (369.1693) mem 22339MB +[2024-07-27 20:03:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][570/625] eta 0:00:31 lr 0.000191 wd 0.0500 time 0.5728 (0.5793) data time 0.0008 (0.0019) model time 0.5719 (0.5766) loss 8.2411 (7.1850) grad_norm 2.4982 (2.6605) loss_scale 512.0000 (371.6708) mem 22339MB +[2024-07-27 20:04:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][580/625] eta 0:00:26 lr 0.000191 wd 0.0500 time 0.5737 (0.5792) data time 0.0009 (0.0019) model time 0.5728 (0.5766) loss 7.0454 (7.1901) grad_norm 1.8767 (2.6590) loss_scale 512.0000 (374.0861) mem 22339MB +[2024-07-27 20:04:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][590/625] eta 0:00:20 lr 0.000191 wd 0.0500 time 0.5708 (0.5793) data time 0.0006 (0.0018) model time 0.5701 (0.5768) loss 8.1354 (7.1942) grad_norm 3.3010 (2.6677) loss_scale 512.0000 (376.4196) mem 22339MB +[2024-07-27 20:04:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][600/625] eta 0:00:14 lr 0.000191 wd 0.0500 time 0.5747 (0.5793) data time 0.0008 (0.0018) model time 0.5739 (0.5767) loss 7.7075 (7.1998) grad_norm 3.6041 (2.6688) loss_scale 512.0000 (378.6755) mem 22339MB +[2024-07-27 20:04:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-27 20:04:15 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 20:04:16 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 20:12:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 20:12:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 20:12:55 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 20:13:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 20:13:09 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 20:13:10 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 20:13:10 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 20:13:10 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 228) +[2024-07-27 20:13:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 20:13:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][610/625] eta 0:00:32 lr 0.000191 wd 0.0500 time 0.5686 (2.1351) data time 0.0004 (0.1024) model time 0.5682 (2.0327) loss 8.0526 (7.4852) grad_norm 2.0088 (2.7390) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 20:13:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [228/300][620/625] eta 0:00:05 lr 0.000191 wd 0.0500 time 0.5698 (1.1570) data time 0.0006 (0.0389) model time 0.5692 (1.1182) loss 8.6076 (7.4818) grad_norm 3.5736 (2.4799) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-27 20:13:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 228 training takes 0:00:20 +[2024-07-27 20:13:35 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 20:13:41 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 20:13:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.460 (0.460) Loss 0.5132 (0.5132) Acc@1 89.697 (89.697) Acc@5 98.926 (98.926) Mem 22341MB +[2024-07-27 20:13:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.124 (0.154) Loss 0.7632 (0.6167) Acc@1 82.227 (87.620) Acc@5 97.070 (98.020) Mem 22341MB +[2024-07-27 20:13:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.123 (0.140) Loss 0.8276 (0.7040) Acc@1 80.957 (84.942) Acc@5 96.533 (97.210) Mem 22341MB +[2024-07-27 20:13:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.595 Acc@5 97.205 +[2024-07-27 20:13:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.6% +[2024-07-27 20:13:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.932 (0.932) Loss 0.5063 (0.5063) Acc@1 90.527 (90.527) Acc@5 98.975 (98.975) Mem 22341MB +[2024-07-27 20:13:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.123 (0.198) Loss 0.7471 (0.6169) Acc@1 83.398 (87.762) Acc@5 96.826 (98.065) Mem 22341MB +[2024-07-27 20:13:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.123 (0.163) Loss 0.8545 (0.7043) Acc@1 79.541 (84.926) Acc@5 96.094 (97.252) Mem 22341MB +[2024-07-27 20:13:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.541 Acc@5 97.237 +[2024-07-27 20:13:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.5% +[2024-07-27 20:13:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.54% +[2024-07-27 20:13:50 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-27 20:13:57 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-27 20:13:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][0/625] eta 0:10:11 lr 0.000191 wd 0.0500 time 0.9788 (0.9788) data time 0.3285 (0.3285) model time 0.0000 (0.0000) loss 6.7614 (6.7614) grad_norm 1.9324 (1.9324) loss_scale 512.0000 (512.0000) mem 22337MB +[2024-07-27 20:14:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][10/625] eta 0:06:15 lr 0.000191 wd 0.0500 time 0.5684 (0.6109) data time 0.0007 (0.0307) model time 0.0000 (0.0000) loss 5.9981 (7.4192) grad_norm 2.1099 (4.9724) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:14:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][20/625] eta 0:05:59 lr 0.000191 wd 0.0500 time 0.5672 (0.5942) data time 0.0010 (0.0168) model time 0.0000 (0.0000) loss 6.8814 (7.2023) grad_norm 2.9351 (5.1100) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:14:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][30/625] eta 0:05:48 lr 0.000190 wd 0.0500 time 0.5691 (0.5862) data time 0.0006 (0.0117) model time 0.0000 (0.0000) loss 7.7892 (7.1917) grad_norm 5.2417 (4.4959) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:14:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][40/625] eta 0:05:44 lr 0.000190 wd 0.0500 time 0.5670 (0.5882) data time 0.0008 (0.0090) model time 0.0000 (0.0000) loss 7.0299 (7.1376) grad_norm 2.7513 (4.0139) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:14:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][50/625] eta 0:05:36 lr 0.000190 wd 0.0500 time 0.5714 (0.5852) data time 0.0009 (0.0074) model time 0.0000 (0.0000) loss 7.0707 (7.1088) grad_norm 2.1328 (3.6774) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:14:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][60/625] eta 0:05:29 lr 0.000190 wd 0.0500 time 0.5714 (0.5831) data time 0.0010 (0.0064) model time 0.5704 (0.5714) loss 7.6115 (7.1827) grad_norm 2.8301 (3.4716) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:14:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][70/625] eta 0:05:22 lr 0.000190 wd 0.0500 time 0.5877 (0.5818) data time 0.0006 (0.0056) model time 0.5871 (0.5720) loss 7.3078 (7.1758) grad_norm 1.7251 (3.3222) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:14:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][80/625] eta 0:05:16 lr 0.000190 wd 0.0500 time 0.5707 (0.5805) data time 0.0007 (0.0051) model time 0.5699 (0.5716) loss 7.9820 (7.2071) grad_norm 2.2606 (3.2045) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:14:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][90/625] eta 0:05:10 lr 0.000190 wd 0.0500 time 0.5698 (0.5798) data time 0.0008 (0.0046) model time 0.5690 (0.5720) loss 6.7485 (7.1899) grad_norm 2.0113 (3.1469) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:14:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][100/625] eta 0:05:04 lr 0.000190 wd 0.0500 time 0.5735 (0.5791) data time 0.0006 (0.0042) model time 0.5729 (0.5720) loss 6.0616 (7.1831) grad_norm 7.2970 (3.0920) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:15:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][110/625] eta 0:04:58 lr 0.000190 wd 0.0500 time 0.5750 (0.5787) data time 0.0008 (0.0039) model time 0.5742 (0.5722) loss 7.7615 (7.1703) grad_norm 2.4801 (3.0412) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:15:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][120/625] eta 0:04:52 lr 0.000190 wd 0.0500 time 0.5736 (0.5785) data time 0.0007 (0.0037) model time 0.5730 (0.5727) loss 6.5596 (7.1618) grad_norm 2.1517 (3.0479) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:15:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][130/625] eta 0:04:46 lr 0.000190 wd 0.0500 time 0.5726 (0.5781) data time 0.0008 (0.0035) model time 0.5718 (0.5726) loss 7.6885 (7.1591) grad_norm 2.9972 (3.0018) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:15:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][140/625] eta 0:04:40 lr 0.000190 wd 0.0500 time 0.5762 (0.5778) data time 0.0009 (0.0033) model time 0.5753 (0.5726) loss 7.6529 (7.1735) grad_norm 2.4570 (2.9974) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:15:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][150/625] eta 0:04:34 lr 0.000190 wd 0.0500 time 0.5696 (0.5777) data time 0.0008 (0.0032) model time 0.5688 (0.5729) loss 6.8935 (7.1687) grad_norm 1.7333 (2.9703) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:15:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][160/625] eta 0:04:28 lr 0.000189 wd 0.0500 time 0.5703 (0.5775) data time 0.0008 (0.0030) model time 0.5695 (0.5730) loss 7.5079 (7.1571) grad_norm 1.9040 (2.9548) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:15:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][170/625] eta 0:04:22 lr 0.000189 wd 0.0500 time 0.5752 (0.5775) data time 0.0009 (0.0029) model time 0.5744 (0.5733) loss 6.5143 (7.1576) grad_norm 2.4994 (3.0780) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:15:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][180/625] eta 0:04:16 lr 0.000189 wd 0.0500 time 0.5712 (0.5773) data time 0.0009 (0.0028) model time 0.5703 (0.5733) loss 6.8277 (7.1441) grad_norm 1.6661 (3.0222) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:15:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][190/625] eta 0:04:11 lr 0.000189 wd 0.0500 time 0.5734 (0.5773) data time 0.0008 (0.0027) model time 0.5726 (0.5734) loss 8.0625 (7.1389) grad_norm 2.1677 (3.0020) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:15:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][200/625] eta 0:04:05 lr 0.000189 wd 0.0500 time 0.5721 (0.5774) data time 0.0006 (0.0026) model time 0.5715 (0.5738) loss 7.1813 (7.1278) grad_norm 2.7386 (2.9978) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:15:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][210/625] eta 0:03:59 lr 0.000189 wd 0.0500 time 0.5741 (0.5773) data time 0.0006 (0.0025) model time 0.5735 (0.5738) loss 6.1035 (7.1323) grad_norm 2.1098 (3.0009) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:16:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][220/625] eta 0:03:53 lr 0.000189 wd 0.0500 time 0.5725 (0.5772) data time 0.0006 (0.0024) model time 0.5719 (0.5738) loss 6.9008 (7.1313) grad_norm 2.4892 (2.9919) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:16:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][230/625] eta 0:03:47 lr 0.000189 wd 0.0500 time 0.5671 (0.5769) data time 0.0007 (0.0024) model time 0.5664 (0.5737) loss 7.7868 (7.1198) grad_norm 3.0282 (2.9881) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:16:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][240/625] eta 0:03:42 lr 0.000189 wd 0.0500 time 0.5731 (0.5768) data time 0.0006 (0.0023) model time 0.5725 (0.5736) loss 7.4775 (7.1187) grad_norm 2.2061 (3.0007) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:16:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][250/625] eta 0:03:36 lr 0.000189 wd 0.0500 time 0.5741 (0.5766) data time 0.0006 (0.0023) model time 0.5735 (0.5735) loss 6.9975 (7.1084) grad_norm 1.9795 (2.9753) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:16:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][260/625] eta 0:03:30 lr 0.000189 wd 0.0500 time 0.5721 (0.5773) data time 0.0008 (0.0022) model time 0.5713 (0.5745) loss 7.7809 (7.1207) grad_norm 2.4303 (2.9649) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:16:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][270/625] eta 0:03:24 lr 0.000189 wd 0.0500 time 0.5751 (0.5772) data time 0.0007 (0.0022) model time 0.5744 (0.5744) loss 6.6355 (7.1203) grad_norm 1.7326 (2.9534) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:16:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][280/625] eta 0:03:19 lr 0.000189 wd 0.0500 time 0.5739 (0.5770) data time 0.0009 (0.0021) model time 0.5731 (0.5743) loss 7.3622 (7.1164) grad_norm 2.3145 (2.9250) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:16:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][290/625] eta 0:03:13 lr 0.000189 wd 0.0500 time 0.5715 (0.5769) data time 0.0009 (0.0021) model time 0.5705 (0.5742) loss 8.5670 (7.1220) grad_norm 2.0705 (2.9304) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:16:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][300/625] eta 0:03:07 lr 0.000188 wd 0.0500 time 0.5716 (0.5768) data time 0.0006 (0.0020) model time 0.5710 (0.5741) loss 7.5044 (7.1326) grad_norm 2.5488 (2.9184) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:16:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][310/625] eta 0:03:01 lr 0.000188 wd 0.0500 time 0.5743 (0.5767) data time 0.0007 (0.0020) model time 0.5735 (0.5741) loss 6.2485 (7.1313) grad_norm 2.0031 (2.8886) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:17:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][320/625] eta 0:02:55 lr 0.000188 wd 0.0500 time 0.5736 (0.5766) data time 0.0007 (0.0020) model time 0.5728 (0.5740) loss 8.2711 (7.1398) grad_norm 2.8181 (2.8693) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:17:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][330/625] eta 0:02:50 lr 0.000188 wd 0.0500 time 0.5722 (0.5765) data time 0.0007 (0.0020) model time 0.5714 (0.5740) loss 7.7147 (7.1394) grad_norm 2.1073 (2.8664) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:17:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][340/625] eta 0:02:44 lr 0.000188 wd 0.0500 time 0.5698 (0.5764) data time 0.0007 (0.0019) model time 0.5691 (0.5739) loss 7.0173 (7.1400) grad_norm 4.2402 (2.8498) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:17:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][350/625] eta 0:02:38 lr 0.000188 wd 0.0500 time 0.5719 (0.5763) data time 0.0010 (0.0019) model time 0.5709 (0.5738) loss 6.1544 (7.1358) grad_norm 1.8156 (2.8330) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:17:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][360/625] eta 0:02:32 lr 0.000188 wd 0.0500 time 0.5727 (0.5762) data time 0.0006 (0.0019) model time 0.5720 (0.5738) loss 5.9548 (7.1352) grad_norm 2.3838 (2.8289) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:17:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][370/625] eta 0:02:26 lr 0.000188 wd 0.0500 time 0.5742 (0.5762) data time 0.0006 (0.0018) model time 0.5736 (0.5738) loss 8.0708 (7.1290) grad_norm 3.0595 (2.8259) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:17:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][380/625] eta 0:02:21 lr 0.000188 wd 0.0500 time 0.5724 (0.5761) data time 0.0008 (0.0018) model time 0.5716 (0.5738) loss 6.8238 (7.1329) grad_norm 2.3297 (2.8405) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:17:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][390/625] eta 0:02:15 lr 0.000188 wd 0.0500 time 0.5710 (0.5761) data time 0.0008 (0.0018) model time 0.5701 (0.5738) loss 8.0381 (7.1381) grad_norm 1.9918 (2.8614) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:17:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][400/625] eta 0:02:09 lr 0.000188 wd 0.0500 time 0.5737 (0.5760) data time 0.0006 (0.0018) model time 0.5731 (0.5737) loss 7.9017 (7.1332) grad_norm 1.8557 (2.8515) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:17:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][410/625] eta 0:02:03 lr 0.000188 wd 0.0500 time 0.5737 (0.5759) data time 0.0006 (0.0017) model time 0.5731 (0.5737) loss 7.6399 (7.1396) grad_norm 1.6054 (2.8366) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:17:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][420/625] eta 0:01:58 lr 0.000188 wd 0.0500 time 0.5721 (0.5762) data time 0.0006 (0.0017) model time 0.5715 (0.5740) loss 8.4760 (7.1428) grad_norm 2.0369 (2.8206) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:18:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][430/625] eta 0:01:52 lr 0.000187 wd 0.0500 time 0.5739 (0.5762) data time 0.0007 (0.0017) model time 0.5732 (0.5740) loss 7.4067 (7.1412) grad_norm 2.2554 (2.8005) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:18:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][440/625] eta 0:01:46 lr 0.000187 wd 0.0500 time 0.5716 (0.5761) data time 0.0009 (0.0017) model time 0.5707 (0.5740) loss 7.2952 (7.1305) grad_norm 2.2018 (2.7913) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:18:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][450/625] eta 0:01:40 lr 0.000187 wd 0.0500 time 0.5730 (0.5761) data time 0.0009 (0.0017) model time 0.5722 (0.5740) loss 7.2156 (7.1150) grad_norm 2.0106 (2.7821) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:18:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][460/625] eta 0:01:35 lr 0.000187 wd 0.0500 time 0.5720 (0.5760) data time 0.0006 (0.0017) model time 0.5714 (0.5739) loss 7.6301 (7.1159) grad_norm 3.2971 (2.7701) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:18:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][470/625] eta 0:01:29 lr 0.000187 wd 0.0500 time 0.5705 (0.5760) data time 0.0009 (0.0016) model time 0.5696 (0.5739) loss 8.1138 (7.1266) grad_norm 2.1715 (2.7644) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:18:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][480/625] eta 0:01:23 lr 0.000187 wd 0.0500 time 0.5726 (0.5762) data time 0.0008 (0.0016) model time 0.5718 (0.5742) loss 7.3371 (7.1232) grad_norm 2.4598 (2.7694) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:18:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][490/625] eta 0:01:17 lr 0.000187 wd 0.0500 time 0.5718 (0.5762) data time 0.0006 (0.0016) model time 0.5712 (0.5742) loss 7.6304 (7.1315) grad_norm 2.4796 (2.7881) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:18:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][500/625] eta 0:01:12 lr 0.000187 wd 0.0500 time 0.5713 (0.5761) data time 0.0007 (0.0016) model time 0.5706 (0.5741) loss 6.6677 (7.1331) grad_norm 2.8870 (2.7805) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:18:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][510/625] eta 0:01:06 lr 0.000187 wd 0.0500 time 0.5711 (0.5760) data time 0.0009 (0.0016) model time 0.5702 (0.5741) loss 7.5704 (7.1288) grad_norm 2.6118 (2.7714) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:18:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][520/625] eta 0:01:00 lr 0.000187 wd 0.0500 time 0.5706 (0.5760) data time 0.0010 (0.0016) model time 0.5696 (0.5740) loss 8.7238 (7.1256) grad_norm 3.1396 (2.7643) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:19:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][530/625] eta 0:00:54 lr 0.000187 wd 0.0500 time 0.5739 (0.5759) data time 0.0006 (0.0016) model time 0.5733 (0.5740) loss 7.1117 (7.1229) grad_norm 2.7708 (2.7552) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:19:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][540/625] eta 0:00:48 lr 0.000187 wd 0.0500 time 0.5702 (0.5759) data time 0.0009 (0.0015) model time 0.5693 (0.5740) loss 7.0842 (7.1327) grad_norm 2.5572 (2.7459) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:19:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][550/625] eta 0:00:43 lr 0.000187 wd 0.0500 time 0.5705 (0.5758) data time 0.0008 (0.0015) model time 0.5697 (0.5740) loss 6.7737 (7.1354) grad_norm 2.4195 (2.7383) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:19:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][560/625] eta 0:00:37 lr 0.000186 wd 0.0500 time 0.5743 (0.5758) data time 0.0006 (0.0015) model time 0.5736 (0.5740) loss 7.4232 (7.1374) grad_norm 2.3765 (2.7281) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:19:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][570/625] eta 0:00:31 lr 0.000186 wd 0.0500 time 0.5726 (0.5758) data time 0.0008 (0.0015) model time 0.5718 (0.5739) loss 8.2942 (7.1431) grad_norm 2.3151 (2.7267) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:19:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][580/625] eta 0:00:25 lr 0.000186 wd 0.0500 time 0.5714 (0.5758) data time 0.0009 (0.0015) model time 0.5706 (0.5739) loss 6.6604 (7.1427) grad_norm 11.8910 (2.7348) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:19:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][590/625] eta 0:00:20 lr 0.000186 wd 0.0500 time 0.5725 (0.5757) data time 0.0008 (0.0015) model time 0.5717 (0.5739) loss 7.7922 (7.1413) grad_norm 2.4196 (2.7307) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:19:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][600/625] eta 0:00:14 lr 0.000186 wd 0.0500 time 0.5705 (0.5757) data time 0.0007 (0.0015) model time 0.5698 (0.5739) loss 6.9392 (7.1424) grad_norm 2.2321 (2.7337) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:19:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][610/625] eta 0:00:08 lr 0.000186 wd 0.0500 time 0.5732 (0.5756) data time 0.0007 (0.0015) model time 0.5726 (0.5738) loss 7.9642 (7.1427) grad_norm 9.3045 (2.7518) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:19:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [229/300][620/625] eta 0:00:02 lr 0.000186 wd 0.0500 time 0.5718 (0.5756) data time 0.0004 (0.0015) model time 0.5715 (0.5738) loss 6.0866 (7.1357) grad_norm 2.1257 (2.7479) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:19:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 229 training takes 0:05:59 +[2024-07-27 20:19:56 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 20:19:58 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 20:19:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.466 (0.466) Loss 0.4998 (0.4998) Acc@1 90.234 (90.234) Acc@5 98.828 (98.828) Mem 22339MB +[2024-07-27 20:20:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.156) Loss 0.7432 (0.6108) Acc@1 82.666 (87.753) Acc@5 96.924 (98.082) Mem 22339MB +[2024-07-27 20:20:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8394 (0.7010) Acc@1 81.152 (85.031) Acc@5 96.045 (97.221) Mem 22339MB +[2024-07-27 20:20:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.633 Acc@5 97.231 +[2024-07-27 20:20:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.6% +[2024-07-27 20:20:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.833 (0.833) Loss 0.5059 (0.5059) Acc@1 90.576 (90.576) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-27 20:20:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.190) Loss 0.7466 (0.6164) Acc@1 83.350 (87.775) Acc@5 96.826 (98.065) Mem 22339MB +[2024-07-27 20:20:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.159) Loss 0.8525 (0.7037) Acc@1 79.590 (84.952) Acc@5 96.094 (97.259) Mem 22339MB +[2024-07-27 20:20:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.555 Acc@5 97.245 +[2024-07-27 20:20:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.6% +[2024-07-27 20:20:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.55% +[2024-07-27 20:20:05 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-27 20:20:07 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-27 20:20:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][0/625] eta 0:09:34 lr 0.000186 wd 0.0500 time 0.9199 (0.9199) data time 0.3989 (0.3989) model time 0.0000 (0.0000) loss 8.8216 (8.8216) grad_norm 1.8831 (1.8831) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:20:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][10/625] eta 0:06:11 lr 0.000186 wd 0.0500 time 0.5701 (0.6042) data time 0.0009 (0.0371) model time 0.0000 (0.0000) loss 6.7767 (7.0977) grad_norm 2.3153 (2.2220) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:20:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][20/625] eta 0:05:56 lr 0.000186 wd 0.0500 time 0.5757 (0.5899) data time 0.0006 (0.0199) model time 0.0000 (0.0000) loss 8.1916 (7.0067) grad_norm 2.3880 (2.3388) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:20:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][30/625] eta 0:05:47 lr 0.000186 wd 0.0500 time 0.5733 (0.5848) data time 0.0008 (0.0137) model time 0.0000 (0.0000) loss 7.0372 (7.2253) grad_norm 3.1727 (2.4458) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:20:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][40/625] eta 0:05:40 lr 0.000186 wd 0.0500 time 0.5715 (0.5822) data time 0.0006 (0.0106) model time 0.0000 (0.0000) loss 7.6932 (7.2091) grad_norm 2.2962 (2.3927) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:20:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][50/625] eta 0:05:33 lr 0.000186 wd 0.0500 time 0.5725 (0.5806) data time 0.0006 (0.0087) model time 0.0000 (0.0000) loss 6.5516 (7.1705) grad_norm 2.5584 (2.5359) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:20:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][60/625] eta 0:05:27 lr 0.000186 wd 0.0500 time 0.5717 (0.5797) data time 0.0006 (0.0075) model time 0.5711 (0.5737) loss 7.5248 (7.1555) grad_norm 1.7825 (2.4627) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:20:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][70/625] eta 0:05:22 lr 0.000185 wd 0.0500 time 0.7341 (0.5816) data time 0.0009 (0.0066) model time 0.7331 (0.5831) loss 6.9226 (7.1274) grad_norm 5.0376 (2.5941) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:20:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][80/625] eta 0:05:16 lr 0.000185 wd 0.0500 time 0.5711 (0.5809) data time 0.0008 (0.0059) model time 0.5703 (0.5803) loss 7.4953 (7.1259) grad_norm 2.7252 (2.6052) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:20:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][90/625] eta 0:05:10 lr 0.000185 wd 0.0500 time 0.5721 (0.5803) data time 0.0006 (0.0054) model time 0.5715 (0.5788) loss 7.8352 (7.1233) grad_norm 1.9510 (2.6098) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:21:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][100/625] eta 0:05:04 lr 0.000185 wd 0.0500 time 0.5722 (0.5796) data time 0.0007 (0.0050) model time 0.5715 (0.5774) loss 7.2904 (7.1556) grad_norm 2.3639 (2.5755) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:21:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][110/625] eta 0:04:58 lr 0.000185 wd 0.0500 time 0.5728 (0.5792) data time 0.0007 (0.0046) model time 0.5721 (0.5769) loss 7.9549 (7.1319) grad_norm 1.9651 (2.5558) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:21:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][120/625] eta 0:04:52 lr 0.000185 wd 0.0500 time 0.5686 (0.5794) data time 0.0009 (0.0044) model time 0.5677 (0.5771) loss 7.7376 (7.1538) grad_norm 2.3549 (2.5291) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:21:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][130/625] eta 0:04:46 lr 0.000185 wd 0.0500 time 0.5711 (0.5794) data time 0.0008 (0.0042) model time 0.5702 (0.5773) loss 7.6507 (7.1796) grad_norm 1.7985 (2.5115) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:21:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][140/625] eta 0:04:41 lr 0.000185 wd 0.0500 time 0.5719 (0.5796) data time 0.0009 (0.0040) model time 0.5710 (0.5777) loss 7.0103 (7.1751) grad_norm 2.1806 (2.5636) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:21:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][150/625] eta 0:04:35 lr 0.000185 wd 0.0500 time 0.5724 (0.5795) data time 0.0007 (0.0038) model time 0.5717 (0.5777) loss 6.0962 (7.1740) grad_norm 2.3392 (2.5575) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:21:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][160/625] eta 0:04:29 lr 0.000185 wd 0.0500 time 0.5749 (0.5793) data time 0.0007 (0.0036) model time 0.5742 (0.5774) loss 5.4397 (7.1566) grad_norm 2.3575 (2.5548) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:21:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][170/625] eta 0:04:23 lr 0.000185 wd 0.0500 time 0.5725 (0.5792) data time 0.0006 (0.0035) model time 0.5719 (0.5774) loss 6.7654 (7.1510) grad_norm 2.0286 (2.5596) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:21:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][180/625] eta 0:04:18 lr 0.000185 wd 0.0500 time 0.5717 (0.5799) data time 0.0009 (0.0033) model time 0.5709 (0.5784) loss 5.7410 (7.1515) grad_norm 12.1304 (2.5977) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 20:21:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-27 20:21:52 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 20:21:53 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 20:27:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 20:27:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 20:28:05 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 20:28:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 20:28:16 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 20:28:17 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 20:28:17 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 20:28:17 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 230) +[2024-07-27 20:28:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 20:28:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][190/625] eta 0:12:28 lr 0.000185 wd 0.0500 time 0.6034 (1.7215) data time 0.0010 (0.0614) model time 0.6024 (1.6601) loss 9.1372 (7.6648) grad_norm 3.3255 (3.1900) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 20:28:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][200/625] eta 0:08:15 lr 0.000184 wd 0.0500 time 0.6378 (1.1652) data time 0.0008 (0.0312) model time 0.6370 (1.1339) loss 7.1695 (7.4256) grad_norm 1.5651 (2.9929) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 20:28:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][210/625] eta 0:06:45 lr 0.000184 wd 0.0500 time 0.5971 (0.9772) data time 0.0010 (0.0212) model time 0.5960 (0.9560) loss 7.8021 (7.5142) grad_norm 4.3224 (3.0158) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 20:28:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][220/625] eta 0:05:57 lr 0.000184 wd 0.0500 time 0.6053 (0.8829) data time 0.0009 (0.0162) model time 0.6044 (0.8667) loss 6.0693 (7.3646) grad_norm 9.8566 (3.1971) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 20:29:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][230/625] eta 0:05:26 lr 0.000184 wd 0.0500 time 0.6598 (0.8274) data time 0.0010 (0.0132) model time 0.6587 (0.8142) loss 6.8976 (7.3552) grad_norm 4.7271 (3.2769) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 20:29:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][240/625] eta 0:05:06 lr 0.000184 wd 0.0500 time 0.6101 (0.7961) data time 0.0008 (0.0112) model time 0.6093 (0.7850) loss 6.8785 (7.2872) grad_norm 1.9272 (3.2437) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 20:29:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][250/625] eta 0:04:48 lr 0.000184 wd 0.0500 time 0.6056 (0.7689) data time 0.0008 (0.0097) model time 0.6048 (0.7592) loss 6.7357 (7.2726) grad_norm 2.3691 (3.1049) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 20:29:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][260/625] eta 0:04:33 lr 0.000184 wd 0.0500 time 0.6143 (0.7492) data time 0.0010 (0.0086) model time 0.6132 (0.7406) loss 7.6844 (7.2873) grad_norm 2.0029 (3.0137) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 20:29:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][270/625] eta 0:04:20 lr 0.000184 wd 0.0500 time 0.6065 (0.7339) data time 0.0008 (0.0078) model time 0.6058 (0.7261) loss 7.4531 (7.2614) grad_norm 2.3031 (2.9319) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 20:29:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][280/625] eta 0:04:08 lr 0.000184 wd 0.0500 time 0.5997 (0.7209) data time 0.0010 (0.0071) model time 0.5987 (0.7138) loss 7.6055 (7.2947) grad_norm 1.8011 (2.8781) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 20:29:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][290/625] eta 0:03:57 lr 0.000184 wd 0.0500 time 0.6058 (0.7102) data time 0.0009 (0.0066) model time 0.6049 (0.7036) loss 6.6411 (7.2826) grad_norm 2.5520 (2.8113) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 20:29:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][300/625] eta 0:03:47 lr 0.000184 wd 0.0500 time 0.6035 (0.7012) data time 0.0009 (0.0061) model time 0.6026 (0.6951) loss 7.3685 (7.2839) grad_norm 2.1955 (2.8157) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 20:29:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][310/625] eta 0:03:38 lr 0.000184 wd 0.0500 time 0.6010 (0.6936) data time 0.0008 (0.0057) model time 0.6002 (0.6879) loss 6.5706 (7.2587) grad_norm 2.1539 (2.7821) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 20:29:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][320/625] eta 0:03:29 lr 0.000184 wd 0.0500 time 0.6072 (0.6874) data time 0.0008 (0.0054) model time 0.6064 (0.6820) loss 6.2190 (7.2518) grad_norm 2.7548 (2.7659) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 20:30:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][330/625] eta 0:03:21 lr 0.000183 wd 0.0500 time 0.6103 (0.6821) data time 0.0009 (0.0051) model time 0.6094 (0.6770) loss 7.5426 (7.2301) grad_norm 2.3729 (2.7831) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 20:30:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][340/625] eta 0:03:13 lr 0.000183 wd 0.0500 time 0.6042 (0.6773) data time 0.0011 (0.0048) model time 0.6031 (0.6725) loss 7.4328 (7.2344) grad_norm 1.9822 (2.7534) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 20:30:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][350/625] eta 0:03:05 lr 0.000183 wd 0.0500 time 0.6020 (0.6731) data time 0.0008 (0.0046) model time 0.6012 (0.6685) loss 6.5278 (7.2526) grad_norm 2.3801 (2.7221) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 20:30:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][360/625] eta 0:02:57 lr 0.000183 wd 0.0500 time 0.6046 (0.6692) data time 0.0008 (0.0044) model time 0.6038 (0.6648) loss 7.2565 (7.2402) grad_norm 3.2733 (2.7270) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 20:30:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][370/625] eta 0:02:49 lr 0.000183 wd 0.0500 time 0.6106 (0.6657) data time 0.0008 (0.0042) model time 0.6098 (0.6615) loss 6.5244 (7.2345) grad_norm 2.3617 (2.7624) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 20:30:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][380/625] eta 0:02:42 lr 0.000183 wd 0.0500 time 0.6018 (0.6626) data time 0.0010 (0.0041) model time 0.6008 (0.6585) loss 7.2317 (7.1962) grad_norm 2.3923 (2.7576) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 20:30:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][390/625] eta 0:02:35 lr 0.000183 wd 0.0500 time 0.6087 (0.6599) data time 0.0008 (0.0039) model time 0.6079 (0.6559) loss 6.9358 (7.1876) grad_norm 3.6268 (2.7461) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 20:30:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][400/625] eta 0:02:27 lr 0.000183 wd 0.0500 time 0.6063 (0.6575) data time 0.0011 (0.0038) model time 0.6051 (0.6537) loss 6.5400 (7.1760) grad_norm 3.7045 (2.7594) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 20:30:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][410/625] eta 0:02:20 lr 0.000183 wd 0.0500 time 0.6083 (0.6554) data time 0.0010 (0.0037) model time 0.6073 (0.6517) loss 9.1471 (7.1817) grad_norm 2.4434 (2.7663) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 20:30:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][420/625] eta 0:02:13 lr 0.000183 wd 0.0500 time 0.6084 (0.6534) data time 0.0010 (0.0036) model time 0.6074 (0.6498) loss 7.6217 (7.1669) grad_norm 3.0390 (2.7852) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 20:31:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][430/625] eta 0:02:06 lr 0.000183 wd 0.0500 time 0.6017 (0.6512) data time 0.0009 (0.0035) model time 0.6008 (0.6478) loss 6.1039 (7.1571) grad_norm 1.9142 (2.7761) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 20:31:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-27 20:31:07 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 20:31:13 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 20:36:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 20:36:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 20:36:29 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 20:36:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 20:36:42 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 20:36:42 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 20:36:42 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 20:36:42 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 230) +[2024-07-27 20:36:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 20:36:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][440/625] eta 0:07:19 lr 0.000183 wd 0.0500 time 0.5641 (2.3751) data time 0.0006 (0.1597) model time 0.5634 (2.2154) loss 8.3347 (7.8519) grad_norm 3.3685 (2.2858) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 20:37:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][450/625] eta 0:03:24 lr 0.000183 wd 0.0500 time 0.5637 (1.1679) data time 0.0008 (0.0539) model time 0.5629 (1.1140) loss 8.0346 (7.5528) grad_norm 2.2278 (2.8350) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-27 20:37:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][460/625] eta 0:02:32 lr 0.000183 wd 0.0500 time 0.5628 (0.9259) data time 0.0009 (0.0327) model time 0.5619 (0.8932) loss 7.4543 (7.4995) grad_norm 4.3371 (inf) loss_scale 256.0000 (430.0800) mem 22344MB +[2024-07-27 20:37:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][470/625] eta 0:02:07 lr 0.000182 wd 0.0500 time 0.5667 (0.8228) data time 0.0010 (0.0236) model time 0.5657 (0.7993) loss 6.3072 (7.4357) grad_norm 3.2862 (inf) loss_scale 256.0000 (380.3429) mem 22344MB +[2024-07-27 20:37:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][480/625] eta 0:01:51 lr 0.000182 wd 0.0500 time 0.5657 (0.7656) data time 0.0009 (0.0186) model time 0.5648 (0.7471) loss 7.5245 (7.3793) grad_norm 1.6592 (inf) loss_scale 256.0000 (352.7111) mem 22344MB +[2024-07-27 20:37:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][490/625] eta 0:01:39 lr 0.000182 wd 0.0500 time 0.7698 (0.7359) data time 0.0006 (0.0153) model time 0.7692 (0.7205) loss 6.3379 (7.3440) grad_norm 2.1106 (inf) loss_scale 256.0000 (335.1273) mem 22344MB +[2024-07-27 20:37:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][500/625] eta 0:01:28 lr 0.000182 wd 0.0500 time 0.5677 (0.7100) data time 0.0009 (0.0131) model time 0.5669 (0.6969) loss 7.5973 (7.2889) grad_norm 1.6671 (inf) loss_scale 256.0000 (322.9538) mem 22344MB +[2024-07-27 20:37:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][510/625] eta 0:01:19 lr 0.000182 wd 0.0500 time 0.5676 (0.6912) data time 0.0009 (0.0115) model time 0.5667 (0.6797) loss 6.5564 (7.2721) grad_norm 2.9081 (inf) loss_scale 256.0000 (314.0267) mem 22344MB +[2024-07-27 20:37:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][520/625] eta 0:01:11 lr 0.000182 wd 0.0500 time 0.5706 (0.6770) data time 0.0008 (0.0103) model time 0.5698 (0.6667) loss 7.7194 (7.2665) grad_norm 1.6985 (inf) loss_scale 256.0000 (307.2000) mem 22344MB +[2024-07-27 20:37:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][530/625] eta 0:01:03 lr 0.000182 wd 0.0500 time 0.5700 (0.6658) data time 0.0008 (0.0093) model time 0.5692 (0.6565) loss 7.5924 (7.2563) grad_norm 2.2622 (inf) loss_scale 256.0000 (301.8105) mem 22344MB +[2024-07-27 20:37:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][540/625] eta 0:00:55 lr 0.000182 wd 0.0500 time 0.5686 (0.6566) data time 0.0009 (0.0085) model time 0.5677 (0.6481) loss 7.4238 (7.2671) grad_norm 2.2309 (inf) loss_scale 256.0000 (297.4476) mem 22344MB +[2024-07-27 20:38:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][550/625] eta 0:00:48 lr 0.000182 wd 0.0500 time 0.5699 (0.6491) data time 0.0007 (0.0078) model time 0.5692 (0.6412) loss 6.7750 (7.2468) grad_norm 1.8990 (inf) loss_scale 256.0000 (293.8435) mem 22344MB +[2024-07-27 20:38:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][560/625] eta 0:00:41 lr 0.000182 wd 0.0500 time 0.5713 (0.6430) data time 0.0006 (0.0073) model time 0.5707 (0.6357) loss 6.8907 (7.2555) grad_norm 2.9421 (inf) loss_scale 256.0000 (290.8160) mem 22344MB +[2024-07-27 20:38:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][570/625] eta 0:00:35 lr 0.000182 wd 0.0500 time 0.5703 (0.6377) data time 0.0006 (0.0068) model time 0.5697 (0.6309) loss 6.5252 (7.2761) grad_norm 2.2187 (inf) loss_scale 256.0000 (288.2370) mem 22344MB +[2024-07-27 20:38:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][580/625] eta 0:00:28 lr 0.000182 wd 0.0500 time 0.5725 (0.6331) data time 0.0009 (0.0064) model time 0.5716 (0.6268) loss 6.5093 (7.2394) grad_norm 1.8237 (inf) loss_scale 256.0000 (286.0138) mem 22344MB +[2024-07-27 20:38:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][590/625] eta 0:00:22 lr 0.000182 wd 0.0500 time 0.5702 (0.6292) data time 0.0009 (0.0060) model time 0.5693 (0.6232) loss 7.8918 (7.2310) grad_norm 2.6692 (inf) loss_scale 256.0000 (284.0774) mem 22344MB +[2024-07-27 20:38:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][600/625] eta 0:00:15 lr 0.000181 wd 0.0500 time 0.5725 (0.6258) data time 0.0009 (0.0057) model time 0.5717 (0.6201) loss 7.8495 (7.2256) grad_norm 2.1530 (inf) loss_scale 256.0000 (282.3758) mem 22344MB +[2024-07-27 20:38:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][610/625] eta 0:00:09 lr 0.000181 wd 0.0500 time 0.5701 (0.6227) data time 0.0004 (0.0055) model time 0.5697 (0.6172) loss 7.0342 (7.2182) grad_norm 2.3752 (inf) loss_scale 256.0000 (280.8686) mem 22344MB +[2024-07-27 20:38:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [230/300][620/625] eta 0:00:03 lr 0.000181 wd 0.0500 time 0.5701 (0.6198) data time 0.0006 (0.0052) model time 0.5695 (0.6146) loss 6.9675 (7.2093) grad_norm 5.3126 (inf) loss_scale 256.0000 (279.5243) mem 22344MB +[2024-07-27 20:38:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 230 training takes 0:01:56 +[2024-07-27 20:38:44 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 20:38:49 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 20:38:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.460 (0.460) Loss 0.5015 (0.5015) Acc@1 90.283 (90.283) Acc@5 99.121 (99.121) Mem 22344MB +[2024-07-27 20:38:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.157) Loss 0.7549 (0.6178) Acc@1 82.666 (87.695) Acc@5 97.021 (98.096) Mem 22344MB +[2024-07-27 20:38:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.8623 (0.7070) Acc@1 79.834 (85.033) Acc@5 96.045 (97.224) Mem 22344MB +[2024-07-27 20:38:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.641 Acc@5 97.217 +[2024-07-27 20:38:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.6% +[2024-07-27 20:38:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.762 (0.762) Loss 0.5059 (0.5059) Acc@1 90.625 (90.625) Acc@5 98.975 (98.975) Mem 22344MB +[2024-07-27 20:38:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.183) Loss 0.7461 (0.6161) Acc@1 83.398 (87.820) Acc@5 96.826 (98.065) Mem 22344MB +[2024-07-27 20:38:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.156) Loss 0.8525 (0.7033) Acc@1 79.688 (84.980) Acc@5 96.191 (97.268) Mem 22344MB +[2024-07-27 20:38:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.587 Acc@5 97.251 +[2024-07-27 20:38:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.6% +[2024-07-27 20:38:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.59% +[2024-07-27 20:38:58 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-27 20:39:02 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-27 20:39:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][0/625] eta 0:14:24 lr 0.000181 wd 0.0500 time 1.3830 (1.3830) data time 0.3224 (0.3224) model time 0.0000 (0.0000) loss 6.5013 (6.5013) grad_norm 3.0436 (3.0436) loss_scale 256.0000 (256.0000) mem 22337MB +[2024-07-27 20:39:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-27 20:39:08 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 20:39:13 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 20:42:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 20:42:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 20:43:02 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 20:43:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 20:43:17 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 20:43:17 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 20:43:17 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 20:43:18 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 231) +[2024-07-27 20:43:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 20:43:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][10/625] eta 1:06:09 lr 0.000181 wd 0.0500 time 1.8517 (6.4548) data time 0.0010 (0.4048) model time 0.0000 (0.0000) loss 7.1613 (7.5141) grad_norm 2.8394 (3.1610) loss_scale 256.0000 (256.0000) mem 22342MB +[2024-07-27 20:43:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][20/625] eta 0:15:47 lr 0.000181 wd 0.0500 time 0.5911 (1.5666) data time 0.0008 (0.0684) model time 0.0000 (0.0000) loss 6.2733 (7.4391) grad_norm 3.6464 (2.7370) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:43:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][30/625] eta 0:11:08 lr 0.000181 wd 0.0500 time 0.5804 (1.1232) data time 0.0010 (0.0378) model time 0.0000 (0.0000) loss 7.7726 (7.3890) grad_norm 2.3701 (2.9232) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:43:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][40/625] eta 0:09:18 lr 0.000181 wd 0.0500 time 0.5887 (0.9551) data time 0.0008 (0.0263) model time 0.0000 (0.0000) loss 7.2838 (7.4041) grad_norm 1.8972 (3.0164) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:43:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][50/625] eta 0:08:18 lr 0.000181 wd 0.0500 time 0.5849 (0.8674) data time 0.0010 (0.0203) model time 0.0000 (0.0000) loss 8.0044 (7.3020) grad_norm 2.8333 (2.8329) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:44:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][60/625] eta 0:07:41 lr 0.000181 wd 0.0500 time 0.5534 (0.8168) data time 0.0008 (0.0166) model time 0.5527 (0.6034) loss 6.3477 (7.2846) grad_norm 3.2221 (2.7853) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:44:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][70/625] eta 0:07:15 lr 0.000181 wd 0.0500 time 0.5910 (0.7838) data time 0.0008 (0.0144) model time 0.5902 (0.6062) loss 7.8422 (7.2260) grad_norm 2.2726 (2.7422) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:44:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][80/625] eta 0:06:52 lr 0.000181 wd 0.0500 time 0.5935 (0.7571) data time 0.0011 (0.0126) model time 0.5924 (0.6009) loss 7.4979 (7.2160) grad_norm 3.7831 (2.6681) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:44:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][90/625] eta 0:06:34 lr 0.000181 wd 0.0500 time 0.5891 (0.7367) data time 0.0010 (0.0112) model time 0.5881 (0.5977) loss 7.1384 (7.2338) grad_norm 2.5746 (2.7553) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:44:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][100/625] eta 0:06:18 lr 0.000181 wd 0.0500 time 0.5931 (0.7209) data time 0.0008 (0.0101) model time 0.5923 (0.5963) loss 6.4848 (7.2376) grad_norm 1.6182 (2.7323) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:44:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][110/625] eta 0:06:04 lr 0.000180 wd 0.0500 time 0.5864 (0.7078) data time 0.0008 (0.0092) model time 0.5856 (0.5946) loss 7.9535 (7.2981) grad_norm 2.3773 (2.6996) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:44:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][120/625] eta 0:05:51 lr 0.000180 wd 0.0500 time 0.5876 (0.6969) data time 0.0011 (0.0085) model time 0.5865 (0.5933) loss 7.4960 (7.2782) grad_norm 2.0109 (2.6445) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:44:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][130/625] eta 0:05:40 lr 0.000180 wd 0.0500 time 0.5881 (0.6879) data time 0.0009 (0.0079) model time 0.5872 (0.5923) loss 6.6317 (7.2610) grad_norm 1.9579 (2.6431) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:44:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][140/625] eta 0:05:29 lr 0.000180 wd 0.0500 time 0.5894 (0.6802) data time 0.0011 (0.0074) model time 0.5883 (0.5915) loss 8.3119 (7.2698) grad_norm 2.2921 (2.6627) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:44:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][150/625] eta 0:05:20 lr 0.000180 wd 0.0500 time 0.5938 (0.6738) data time 0.0011 (0.0069) model time 0.5927 (0.5912) loss 7.5593 (7.2541) grad_norm 2.3941 (2.6567) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:45:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][160/625] eta 0:05:10 lr 0.000180 wd 0.0500 time 0.5874 (0.6683) data time 0.0010 (0.0066) model time 0.5864 (0.5909) loss 8.7988 (7.2526) grad_norm 2.4157 (2.6648) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:45:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][170/625] eta 0:05:01 lr 0.000180 wd 0.0500 time 0.6004 (0.6634) data time 0.0010 (0.0062) model time 0.5994 (0.5907) loss 8.6328 (7.2833) grad_norm 2.0621 (2.6480) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:45:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][180/625] eta 0:04:53 lr 0.000180 wd 0.0500 time 0.5867 (0.6591) data time 0.0011 (0.0059) model time 0.5857 (0.5906) loss 6.9900 (7.2795) grad_norm 2.0383 (2.6307) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:45:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][190/625] eta 0:04:45 lr 0.000180 wd 0.0500 time 0.5897 (0.6552) data time 0.0010 (0.0057) model time 0.5887 (0.5903) loss 7.9884 (7.2610) grad_norm 2.5873 (2.6448) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:45:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][200/625] eta 0:04:37 lr 0.000180 wd 0.0500 time 0.5914 (0.6518) data time 0.0010 (0.0054) model time 0.5904 (0.5901) loss 7.9385 (7.2452) grad_norm 2.0558 (2.6405) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:45:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][210/625] eta 0:04:29 lr 0.000180 wd 0.0500 time 0.5883 (0.6487) data time 0.0008 (0.0052) model time 0.5875 (0.5900) loss 7.1444 (7.2284) grad_norm 2.3283 (2.6272) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:45:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][220/625] eta 0:04:21 lr 0.000180 wd 0.0500 time 0.5897 (0.6459) data time 0.0011 (0.0050) model time 0.5886 (0.5900) loss 6.9589 (7.2243) grad_norm 2.9204 (2.6166) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:45:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][230/625] eta 0:04:14 lr 0.000180 wd 0.0500 time 0.5896 (0.6435) data time 0.0008 (0.0048) model time 0.5887 (0.5900) loss 7.4960 (7.2057) grad_norm 2.3567 (2.6143) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:45:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][240/625] eta 0:04:06 lr 0.000180 wd 0.0500 time 0.5938 (0.6413) data time 0.0008 (0.0047) model time 0.5930 (0.5901) loss 7.3882 (7.2104) grad_norm 2.7613 (2.6026) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:45:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][250/625] eta 0:03:59 lr 0.000179 wd 0.0500 time 0.5890 (0.6394) data time 0.0008 (0.0045) model time 0.5882 (0.5903) loss 7.3643 (7.2033) grad_norm 1.8967 (2.5929) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:46:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][260/625] eta 0:03:52 lr 0.000179 wd 0.0500 time 0.5847 (0.6374) data time 0.0008 (0.0044) model time 0.5840 (0.5902) loss 7.5982 (7.1896) grad_norm 2.9362 (2.5879) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:46:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][270/625] eta 0:03:45 lr 0.000179 wd 0.0500 time 0.5867 (0.6355) data time 0.0010 (0.0043) model time 0.5857 (0.5900) loss 6.6192 (7.1783) grad_norm 3.5001 (2.5986) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:46:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][280/625] eta 0:03:38 lr 0.000179 wd 0.0500 time 0.7227 (0.6342) data time 0.0009 (0.0041) model time 0.7217 (0.5904) loss 6.0930 (7.1668) grad_norm 1.9524 (2.5778) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:46:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][290/625] eta 0:03:32 lr 0.000179 wd 0.0500 time 0.5952 (0.6334) data time 0.0010 (0.0040) model time 0.5942 (0.5913) loss 6.8985 (7.1736) grad_norm 1.6303 (2.5705) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:46:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][300/625] eta 0:03:25 lr 0.000179 wd 0.0500 time 0.5931 (0.6320) data time 0.0010 (0.0039) model time 0.5921 (0.5913) loss 6.5511 (7.1732) grad_norm 2.1397 (2.6072) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:46:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][310/625] eta 0:03:18 lr 0.000179 wd 0.0500 time 0.5933 (0.6308) data time 0.0010 (0.0038) model time 0.5923 (0.5914) loss 8.7647 (7.1708) grad_norm 1.9390 (2.5871) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:46:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][320/625] eta 0:03:11 lr 0.000179 wd 0.0500 time 0.5903 (0.6295) data time 0.0009 (0.0038) model time 0.5894 (0.5913) loss 9.2572 (7.1768) grad_norm 3.0310 (2.5860) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:46:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][330/625] eta 0:03:05 lr 0.000179 wd 0.0500 time 0.5901 (0.6282) data time 0.0010 (0.0037) model time 0.5891 (0.5911) loss 7.2967 (7.1908) grad_norm 2.1473 (2.6082) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:46:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][340/625] eta 0:02:58 lr 0.000179 wd 0.0500 time 0.5834 (0.6269) data time 0.0010 (0.0036) model time 0.5824 (0.5909) loss 7.8023 (7.1929) grad_norm 3.8495 (2.6226) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:46:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][350/625] eta 0:02:52 lr 0.000179 wd 0.0500 time 0.5878 (0.6257) data time 0.0007 (0.0035) model time 0.5870 (0.5907) loss 7.4279 (7.2033) grad_norm 2.0756 (2.6179) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:47:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][360/625] eta 0:02:45 lr 0.000179 wd 0.0500 time 0.5894 (0.6246) data time 0.0007 (0.0034) model time 0.5887 (0.5906) loss 7.8398 (7.2006) grad_norm 1.9613 (2.6238) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:47:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][370/625] eta 0:02:39 lr 0.000179 wd 0.0500 time 0.5931 (0.6237) data time 0.0008 (0.0034) model time 0.5923 (0.5906) loss 7.9680 (7.1979) grad_norm 3.0159 (2.6265) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:47:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][380/625] eta 0:02:32 lr 0.000178 wd 0.0500 time 0.6032 (0.6230) data time 0.0010 (0.0033) model time 0.6022 (0.5907) loss 7.2911 (7.1992) grad_norm 2.2179 (2.6251) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:47:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][390/625] eta 0:02:26 lr 0.000178 wd 0.0500 time 0.5939 (0.6222) data time 0.0008 (0.0032) model time 0.5931 (0.5908) loss 7.5911 (7.1923) grad_norm 2.1385 (2.6100) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:47:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][400/625] eta 0:02:19 lr 0.000178 wd 0.0500 time 0.5864 (0.6214) data time 0.0008 (0.0032) model time 0.5856 (0.5907) loss 6.2991 (7.1830) grad_norm 2.6726 (2.6071) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:47:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][410/625] eta 0:02:13 lr 0.000178 wd 0.0500 time 0.5929 (0.6205) data time 0.0008 (0.0031) model time 0.5921 (0.5906) loss 6.7325 (7.1853) grad_norm 3.2426 (2.6493) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:47:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][420/625] eta 0:02:07 lr 0.000178 wd 0.0500 time 0.5866 (0.6197) data time 0.0011 (0.0031) model time 0.5855 (0.5904) loss 7.3298 (7.1858) grad_norm 2.7482 (2.6595) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:47:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][430/625] eta 0:02:00 lr 0.000178 wd 0.0500 time 0.5863 (0.6189) data time 0.0008 (0.0030) model time 0.5855 (0.5903) loss 6.4203 (7.1846) grad_norm 3.1883 (2.6638) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:47:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][440/625] eta 0:01:54 lr 0.000178 wd 0.0500 time 0.5902 (0.6183) data time 0.0009 (0.0030) model time 0.5893 (0.5903) loss 6.7943 (7.1920) grad_norm 2.2827 (2.6668) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:47:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][450/625] eta 0:01:48 lr 0.000178 wd 0.0500 time 0.5994 (0.6177) data time 0.0008 (0.0030) model time 0.5986 (0.5903) loss 7.2134 (7.1996) grad_norm 2.0422 (2.6578) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:48:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][460/625] eta 0:01:41 lr 0.000178 wd 0.0500 time 0.5918 (0.6171) data time 0.0011 (0.0029) model time 0.5907 (0.5903) loss 6.3714 (7.1893) grad_norm 3.5628 (2.6589) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:48:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][470/625] eta 0:01:35 lr 0.000178 wd 0.0500 time 0.5838 (0.6165) data time 0.0008 (0.0029) model time 0.5830 (0.5902) loss 6.8313 (7.1870) grad_norm 2.9105 (2.6611) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:48:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][480/625] eta 0:01:29 lr 0.000178 wd 0.0500 time 0.5852 (0.6158) data time 0.0010 (0.0028) model time 0.5842 (0.5901) loss 7.6846 (7.1767) grad_norm 2.4692 (2.6655) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:48:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][490/625] eta 0:01:23 lr 0.000178 wd 0.0500 time 0.5935 (0.6152) data time 0.0008 (0.0028) model time 0.5927 (0.5900) loss 6.7190 (7.1696) grad_norm 2.2593 (2.6680) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:48:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][500/625] eta 0:01:16 lr 0.000178 wd 0.0500 time 0.5873 (0.6150) data time 0.0010 (0.0028) model time 0.5863 (0.5903) loss 7.2422 (7.1755) grad_norm 2.4135 (2.6678) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:48:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][510/625] eta 0:01:10 lr 0.000178 wd 0.0500 time 0.5959 (0.6148) data time 0.0010 (0.0027) model time 0.5949 (0.5906) loss 7.3833 (7.1719) grad_norm 1.7329 (2.6689) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:48:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][520/625] eta 0:01:04 lr 0.000177 wd 0.0500 time 0.5921 (0.6143) data time 0.0009 (0.0027) model time 0.5913 (0.5906) loss 7.6868 (7.1820) grad_norm 3.9431 (2.6913) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:48:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][530/625] eta 0:00:58 lr 0.000177 wd 0.0500 time 0.5931 (0.6139) data time 0.0008 (0.0027) model time 0.5923 (0.5906) loss 6.8859 (7.1808) grad_norm 2.0744 (2.6863) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:48:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][540/625] eta 0:00:52 lr 0.000177 wd 0.0500 time 0.5937 (0.6135) data time 0.0010 (0.0026) model time 0.5927 (0.5906) loss 7.5577 (7.1792) grad_norm 1.7244 (2.6812) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:48:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][550/625] eta 0:00:45 lr 0.000177 wd 0.0500 time 0.5902 (0.6130) data time 0.0009 (0.0026) model time 0.5893 (0.5905) loss 5.5594 (7.1717) grad_norm 3.6636 (2.6776) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:49:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][560/625] eta 0:00:39 lr 0.000177 wd 0.0500 time 0.5905 (0.6125) data time 0.0008 (0.0026) model time 0.5897 (0.5904) loss 8.0045 (7.1707) grad_norm 2.2812 (2.6733) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:49:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][570/625] eta 0:00:33 lr 0.000177 wd 0.0500 time 0.5810 (0.6120) data time 0.0008 (0.0025) model time 0.5803 (0.5903) loss 7.0467 (7.1787) grad_norm 1.9978 (2.6665) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 20:49:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-27 20:49:09 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 20:49:12 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 21:05:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 21:05:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 21:05:36 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 21:05:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 21:05:48 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 21:05:49 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 21:05:49 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 21:05:49 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 231) +[2024-07-27 21:05:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 21:06:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][580/625] eta 0:02:37 lr 0.000177 wd 0.0500 time 0.6040 (3.5054) data time 0.0010 (0.1631) model time 0.6030 (3.3423) loss 7.7732 (7.3529) grad_norm 1.8727 (2.0739) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:06:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][590/625] eta 0:00:54 lr 0.000177 wd 0.0500 time 0.6006 (1.5702) data time 0.0011 (0.0551) model time 0.5994 (1.5151) loss 8.1391 (7.3006) grad_norm 1.7506 (2.2248) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:06:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][600/625] eta 0:00:29 lr 0.000177 wd 0.0500 time 0.5960 (1.1822) data time 0.0011 (0.0336) model time 0.5949 (1.1486) loss 7.7344 (7.3269) grad_norm 2.5293 (2.5742) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:06:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][610/625] eta 0:00:15 lr 0.000177 wd 0.0500 time 0.5927 (1.0147) data time 0.0008 (0.0244) model time 0.5918 (0.9903) loss 6.9040 (7.3260) grad_norm 5.0631 (2.8956) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:06:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [231/300][620/625] eta 0:00:04 lr 0.000177 wd 0.0500 time 0.5938 (0.9220) data time 0.0008 (0.0192) model time 0.5930 (0.9028) loss 7.2005 (7.2345) grad_norm 2.5006 (3.0807) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:06:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 231 training takes 0:00:43 +[2024-07-27 21:06:38 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 21:06:42 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 21:06:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.519 (0.519) Loss 0.5044 (0.5044) Acc@1 90.283 (90.283) Acc@5 98.926 (98.926) Mem 22344MB +[2024-07-27 21:06:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.161) Loss 0.7539 (0.6133) Acc@1 82.812 (87.695) Acc@5 96.826 (98.047) Mem 22344MB +[2024-07-27 21:06:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.144) Loss 0.8481 (0.7023) Acc@1 80.127 (85.063) Acc@5 95.996 (97.207) Mem 22344MB +[2024-07-27 21:06:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.655 Acc@5 97.193 +[2024-07-27 21:06:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.7% +[2024-07-27 21:06:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.900 (0.900) Loss 0.5059 (0.5059) Acc@1 90.527 (90.527) Acc@5 99.023 (99.023) Mem 22344MB +[2024-07-27 21:06:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.195) Loss 0.7461 (0.6157) Acc@1 83.301 (87.815) Acc@5 96.875 (98.082) Mem 22344MB +[2024-07-27 21:06:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.162) Loss 0.8501 (0.7027) Acc@1 79.688 (84.977) Acc@5 96.191 (97.280) Mem 22344MB +[2024-07-27 21:06:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.583 Acc@5 97.263 +[2024-07-27 21:06:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.6% +[2024-07-27 21:06:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.58% +[2024-07-27 21:06:52 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-27 21:06:55 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-27 21:06:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][0/625] eta 0:11:26 lr 0.000177 wd 0.0500 time 1.0987 (1.0987) data time 0.3874 (0.3874) model time 0.0000 (0.0000) loss 6.8599 (6.8599) grad_norm 2.7497 (2.7497) loss_scale 256.0000 (256.0000) mem 22337MB +[2024-07-27 21:07:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][10/625] eta 0:06:47 lr 0.000177 wd 0.0500 time 0.5933 (0.6632) data time 0.0008 (0.0362) model time 0.0000 (0.0000) loss 7.2715 (6.8395) grad_norm 2.0123 (3.9198) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:07:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][20/625] eta 0:06:23 lr 0.000177 wd 0.0500 time 0.6015 (0.6339) data time 0.0008 (0.0195) model time 0.0000 (0.0000) loss 6.3711 (7.0316) grad_norm 3.4908 (3.3086) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:07:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][30/625] eta 0:06:10 lr 0.000176 wd 0.0500 time 0.5973 (0.6225) data time 0.0011 (0.0136) model time 0.0000 (0.0000) loss 7.8403 (7.0529) grad_norm 1.6967 (3.0041) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:07:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][40/625] eta 0:06:00 lr 0.000176 wd 0.0500 time 0.5999 (0.6168) data time 0.0008 (0.0105) model time 0.0000 (0.0000) loss 7.4805 (7.0540) grad_norm 2.2205 (2.8521) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:07:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][50/625] eta 0:05:52 lr 0.000176 wd 0.0500 time 0.5997 (0.6135) data time 0.0011 (0.0087) model time 0.0000 (0.0000) loss 7.8431 (7.0600) grad_norm 2.1000 (2.7495) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:07:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][60/625] eta 0:05:45 lr 0.000176 wd 0.0500 time 0.6155 (0.6114) data time 0.0010 (0.0074) model time 0.6145 (0.5998) loss 6.8578 (7.0239) grad_norm 2.1435 (2.6739) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:07:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][70/625] eta 0:05:38 lr 0.000176 wd 0.0500 time 0.5992 (0.6099) data time 0.0008 (0.0066) model time 0.5984 (0.5995) loss 7.2813 (7.0970) grad_norm 2.6212 (2.6187) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:07:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][80/625] eta 0:05:31 lr 0.000176 wd 0.0500 time 0.5976 (0.6088) data time 0.0008 (0.0059) model time 0.5968 (0.5997) loss 7.3679 (7.0764) grad_norm 2.2527 (2.6293) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:07:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][90/625] eta 0:05:25 lr 0.000176 wd 0.0500 time 0.6021 (0.6079) data time 0.0009 (0.0054) model time 0.6013 (0.5996) loss 6.1030 (7.0919) grad_norm 3.4700 (2.5988) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:07:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][100/625] eta 0:05:18 lr 0.000176 wd 0.0500 time 0.5990 (0.6073) data time 0.0011 (0.0050) model time 0.5980 (0.5998) loss 8.3594 (7.1294) grad_norm 1.4475 (2.5414) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:08:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][110/625] eta 0:05:12 lr 0.000176 wd 0.0500 time 0.5962 (0.6065) data time 0.0010 (0.0046) model time 0.5951 (0.5994) loss 7.2069 (7.1341) grad_norm 2.8112 (2.5118) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:08:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-27 21:08:08 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 21:08:09 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 21:09:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 21:09:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 21:10:09 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 21:10:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 21:10:18 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 21:10:19 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 21:10:19 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 21:10:19 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 232) +[2024-07-27 21:10:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 21:10:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][120/625] eta 0:43:58 lr 0.000176 wd 0.0500 time 1.6839 (5.2248) data time 0.0009 (0.3678) model time 1.6830 (4.8570) loss 7.7346 (7.5881) grad_norm 2.4930 (2.4652) loss_scale 256.0000 (256.0000) mem 22342MB +[2024-07-27 21:10:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][130/625] eta 0:11:07 lr 0.000176 wd 0.0500 time 0.5741 (1.3490) data time 0.0007 (0.0620) model time 0.5734 (1.2871) loss 6.3983 (7.3831) grad_norm 2.2558 (2.2569) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:10:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][140/625] eta 0:08:03 lr 0.000176 wd 0.0500 time 0.5740 (0.9976) data time 0.0008 (0.0342) model time 0.5732 (0.9635) loss 7.7575 (7.3663) grad_norm 1.7969 (2.3138) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:10:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][150/625] eta 0:06:50 lr 0.000176 wd 0.0500 time 0.5725 (0.8650) data time 0.0007 (0.0237) model time 0.5719 (0.8413) loss 6.8852 (7.4013) grad_norm 1.6900 (2.2349) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:10:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][160/625] eta 0:06:10 lr 0.000175 wd 0.0500 time 0.5758 (0.7957) data time 0.0008 (0.0183) model time 0.5750 (0.7774) loss 7.1172 (7.3134) grad_norm 1.8582 (2.3753) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:11:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][170/625] eta 0:05:43 lr 0.000175 wd 0.0500 time 0.5183 (0.7554) data time 0.0007 (0.0149) model time 0.5176 (0.7405) loss 6.6357 (7.2764) grad_norm 2.0584 (2.3304) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:11:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][180/625] eta 0:05:24 lr 0.000175 wd 0.0500 time 0.5782 (0.7298) data time 0.0006 (0.0126) model time 0.5775 (0.7172) loss 7.1347 (7.2216) grad_norm 1.7826 (2.4065) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:11:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][190/625] eta 0:05:08 lr 0.000175 wd 0.0500 time 0.5813 (0.7089) data time 0.0008 (0.0110) model time 0.5805 (0.6979) loss 7.0667 (7.1492) grad_norm 11.0368 (2.7513) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:11:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][200/625] eta 0:04:54 lr 0.000175 wd 0.0500 time 0.5802 (0.6930) data time 0.0008 (0.0098) model time 0.5794 (0.6832) loss 7.2998 (7.1505) grad_norm 2.3685 (2.7328) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:11:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][210/625] eta 0:04:42 lr 0.000175 wd 0.0500 time 0.5799 (0.6807) data time 0.0006 (0.0088) model time 0.5793 (0.6719) loss 6.2529 (7.1598) grad_norm 2.8018 (2.7279) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:11:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][220/625] eta 0:04:31 lr 0.000175 wd 0.0500 time 0.5756 (0.6706) data time 0.0006 (0.0080) model time 0.5749 (0.6626) loss 7.7825 (7.2039) grad_norm 2.5446 (2.7021) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:11:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][230/625] eta 0:04:21 lr 0.000175 wd 0.0500 time 0.5785 (0.6622) data time 0.0008 (0.0073) model time 0.5777 (0.6548) loss 7.6253 (7.2027) grad_norm 1.7144 (2.6688) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:11:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][240/625] eta 0:04:12 lr 0.000175 wd 0.0500 time 0.5815 (0.6552) data time 0.0006 (0.0068) model time 0.5809 (0.6484) loss 6.7612 (7.2015) grad_norm 3.1951 (2.6402) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:11:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][250/625] eta 0:04:03 lr 0.000175 wd 0.0500 time 0.5808 (0.6493) data time 0.0008 (0.0064) model time 0.5800 (0.6429) loss 7.1606 (7.1714) grad_norm 1.8810 (2.6315) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:11:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][260/625] eta 0:03:55 lr 0.000175 wd 0.0500 time 0.5800 (0.6445) data time 0.0008 (0.0060) model time 0.5792 (0.6385) loss 7.7558 (7.1751) grad_norm 2.6429 (2.6460) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:12:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][270/625] eta 0:03:47 lr 0.000175 wd 0.0500 time 0.5799 (0.6403) data time 0.0009 (0.0056) model time 0.5791 (0.6347) loss 7.6688 (7.1697) grad_norm 2.4042 (2.6399) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:12:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][280/625] eta 0:03:39 lr 0.000175 wd 0.0500 time 0.5836 (0.6367) data time 0.0009 (0.0053) model time 0.5827 (0.6314) loss 7.9029 (7.1794) grad_norm 2.3894 (2.6094) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:12:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][290/625] eta 0:03:32 lr 0.000175 wd 0.0500 time 0.5772 (0.6335) data time 0.0008 (0.0051) model time 0.5764 (0.6284) loss 7.0250 (7.1772) grad_norm 3.5908 (2.5902) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:12:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][300/625] eta 0:03:24 lr 0.000174 wd 0.0500 time 0.5820 (0.6305) data time 0.0008 (0.0048) model time 0.5812 (0.6257) loss 7.7586 (7.1765) grad_norm 3.8709 (2.7380) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:12:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][310/625] eta 0:03:17 lr 0.000174 wd 0.0500 time 0.5792 (0.6279) data time 0.0008 (0.0046) model time 0.5784 (0.6232) loss 8.0172 (7.1726) grad_norm 2.7077 (2.7205) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:12:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][320/625] eta 0:03:10 lr 0.000174 wd 0.0500 time 0.5735 (0.6254) data time 0.0006 (0.0044) model time 0.5729 (0.6210) loss 7.3424 (7.1576) grad_norm 2.0980 (2.6957) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:12:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-27 21:12:30 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 21:12:35 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 21:16:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 21:16:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 21:16:35 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 21:16:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 21:16:46 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 21:16:46 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 21:16:46 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 21:16:47 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 232) +[2024-07-27 21:16:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 21:17:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][330/625] eta 0:07:11 lr 0.000174 wd 0.0500 time 0.5693 (1.4634) data time 0.0009 (0.0715) model time 0.5683 (1.3919) loss 8.2424 (7.4177) grad_norm 2.3230 (3.0788) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:17:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][340/625] eta 0:04:49 lr 0.000174 wd 0.0500 time 0.5657 (1.0159) data time 0.0007 (0.0362) model time 0.5650 (0.9797) loss 7.5840 (7.2654) grad_norm 2.5886 (2.9410) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:17:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][350/625] eta 0:03:58 lr 0.000174 wd 0.0500 time 0.5688 (0.8663) data time 0.0009 (0.0245) model time 0.5679 (0.8418) loss 7.7959 (7.3399) grad_norm 2.2554 (2.7313) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:17:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][360/625] eta 0:03:29 lr 0.000174 wd 0.0500 time 0.5694 (0.7918) data time 0.0006 (0.0186) model time 0.5688 (0.7732) loss 6.1636 (7.2938) grad_norm 3.0034 (2.7093) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:17:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][370/625] eta 0:03:10 lr 0.000174 wd 0.0500 time 0.5702 (0.7472) data time 0.0009 (0.0150) model time 0.5693 (0.7322) loss 6.9728 (7.2540) grad_norm 2.9049 (2.6511) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:17:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][380/625] eta 0:02:57 lr 0.000174 wd 0.0500 time 0.5722 (0.7239) data time 0.0007 (0.0127) model time 0.5715 (0.7112) loss 7.1589 (7.2147) grad_norm 1.7752 (2.6389) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:17:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][390/625] eta 0:02:45 lr 0.000174 wd 0.0500 time 0.5863 (0.7025) data time 0.0006 (0.0110) model time 0.5857 (0.6915) loss 6.7317 (7.1793) grad_norm 2.9762 (2.6400) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:17:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][400/625] eta 0:02:34 lr 0.000174 wd 0.0500 time 0.5743 (0.6862) data time 0.0008 (0.0097) model time 0.5735 (0.6765) loss 8.2270 (7.1886) grad_norm 2.5835 (2.6917) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:17:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][410/625] eta 0:02:24 lr 0.000174 wd 0.0500 time 0.5773 (0.6737) data time 0.0008 (0.0087) model time 0.5765 (0.6650) loss 7.4217 (7.1536) grad_norm 1.8802 (2.6229) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:17:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][420/625] eta 0:02:16 lr 0.000174 wd 0.0500 time 0.5717 (0.6637) data time 0.0009 (0.0079) model time 0.5708 (0.6557) loss 8.2893 (7.1896) grad_norm 2.7234 (2.7743) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:18:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][430/625] eta 0:02:07 lr 0.000174 wd 0.0500 time 0.5762 (0.6555) data time 0.0009 (0.0073) model time 0.5753 (0.6482) loss 6.5932 (7.1970) grad_norm 1.9855 (2.8089) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:18:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][440/625] eta 0:02:00 lr 0.000173 wd 0.0500 time 0.5722 (0.6488) data time 0.0006 (0.0068) model time 0.5715 (0.6420) loss 7.3708 (7.2101) grad_norm 4.7708 (2.8564) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:18:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][450/625] eta 0:01:52 lr 0.000173 wd 0.0500 time 0.5753 (0.6430) data time 0.0007 (0.0063) model time 0.5746 (0.6367) loss 6.7054 (7.1943) grad_norm 2.5329 (2.8250) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:18:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][460/625] eta 0:01:45 lr 0.000173 wd 0.0500 time 0.5760 (0.6383) data time 0.0007 (0.0059) model time 0.5753 (0.6324) loss 6.2753 (7.1921) grad_norm 2.1958 (2.7805) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:18:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][470/625] eta 0:01:38 lr 0.000173 wd 0.0500 time 0.5762 (0.6342) data time 0.0008 (0.0056) model time 0.5754 (0.6286) loss 8.5138 (7.1929) grad_norm 3.4067 (2.7482) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:18:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][480/625] eta 0:01:31 lr 0.000173 wd 0.0500 time 0.5847 (0.6306) data time 0.0022 (0.0053) model time 0.5825 (0.6253) loss 8.1262 (7.2048) grad_norm 3.3778 (2.7426) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:18:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][490/625] eta 0:01:24 lr 0.000173 wd 0.0500 time 0.5741 (0.6274) data time 0.0007 (0.0051) model time 0.5734 (0.6223) loss 5.8388 (7.2082) grad_norm 3.1029 (2.7749) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:18:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][500/625] eta 0:01:18 lr 0.000173 wd 0.0500 time 0.5749 (0.6245) data time 0.0006 (0.0049) model time 0.5743 (0.6196) loss 6.3706 (7.1863) grad_norm 3.7565 (2.7843) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:18:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][510/625] eta 0:01:11 lr 0.000173 wd 0.0500 time 0.5775 (0.6219) data time 0.0007 (0.0046) model time 0.5768 (0.6172) loss 6.1715 (7.1832) grad_norm 2.1794 (2.7771) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:18:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][520/625] eta 0:01:05 lr 0.000173 wd 0.0500 time 0.5750 (0.6196) data time 0.0009 (0.0045) model time 0.5742 (0.6151) loss 7.2990 (7.1481) grad_norm 2.0061 (2.7533) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:19:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][530/625] eta 0:00:58 lr 0.000173 wd 0.0500 time 0.5783 (0.6175) data time 0.0006 (0.0043) model time 0.5777 (0.6133) loss 7.2984 (7.1400) grad_norm 1.5998 (2.7307) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:19:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][540/625] eta 0:00:52 lr 0.000173 wd 0.0500 time 0.5764 (0.6157) data time 0.0008 (0.0041) model time 0.5756 (0.6115) loss 7.0918 (7.1334) grad_norm 2.1985 (2.7203) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:19:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][550/625] eta 0:00:46 lr 0.000173 wd 0.0500 time 0.5784 (0.6140) data time 0.0009 (0.0040) model time 0.5775 (0.6100) loss 8.2785 (7.1414) grad_norm 3.6312 (2.7436) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:19:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][560/625] eta 0:00:39 lr 0.000173 wd 0.0500 time 0.5749 (0.6124) data time 0.0008 (0.0038) model time 0.5741 (0.6085) loss 8.2978 (7.1433) grad_norm 2.4307 (2.7931) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:19:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][570/625] eta 0:00:33 lr 0.000172 wd 0.0500 time 0.5733 (0.6108) data time 0.0006 (0.0037) model time 0.5727 (0.6071) loss 5.7664 (7.1255) grad_norm 1.9507 (2.7723) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:19:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][580/625] eta 0:00:27 lr 0.000172 wd 0.0500 time 0.5757 (0.6094) data time 0.0008 (0.0036) model time 0.5749 (0.6057) loss 5.9055 (7.1153) grad_norm 1.6806 (2.7536) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:19:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][590/625] eta 0:00:21 lr 0.000172 wd 0.0500 time 0.5766 (0.6080) data time 0.0007 (0.0035) model time 0.5759 (0.6045) loss 7.3168 (7.1074) grad_norm 1.7709 (2.7327) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:19:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][600/625] eta 0:00:15 lr 0.000172 wd 0.0500 time 0.5853 (0.6079) data time 0.0010 (0.0034) model time 0.5843 (0.6045) loss 7.8814 (7.1179) grad_norm 2.7327 (2.7361) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:19:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][610/625] eta 0:00:09 lr 0.000172 wd 0.0500 time 0.5759 (0.6068) data time 0.0007 (0.0033) model time 0.5752 (0.6034) loss 6.1068 (7.1115) grad_norm 2.4643 (2.7146) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:19:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [232/300][620/625] eta 0:00:03 lr 0.000172 wd 0.0500 time 0.5742 (0.6057) data time 0.0004 (0.0033) model time 0.5738 (0.6025) loss 5.9609 (7.0971) grad_norm 2.5122 (2.7381) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 21:19:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 232 training takes 0:03:04 +[2024-07-27 21:19:55 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 21:20:00 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 21:20:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.483 (0.483) Loss 0.4995 (0.4995) Acc@1 90.137 (90.137) Acc@5 98.975 (98.975) Mem 22341MB +[2024-07-27 21:20:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.158) Loss 0.7480 (0.6116) Acc@1 82.959 (87.713) Acc@5 96.875 (98.105) Mem 22341MB +[2024-07-27 21:20:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8442 (0.7005) Acc@1 80.029 (84.938) Acc@5 96.338 (97.240) Mem 22341MB +[2024-07-27 21:20:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.591 Acc@5 97.229 +[2024-07-27 21:20:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.6% +[2024-07-27 21:20:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.895 (0.895) Loss 0.5059 (0.5059) Acc@1 90.479 (90.479) Acc@5 98.975 (98.975) Mem 22341MB +[2024-07-27 21:20:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.124 (0.195) Loss 0.7451 (0.6155) Acc@1 83.252 (87.824) Acc@5 96.924 (98.087) Mem 22341MB +[2024-07-27 21:20:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.162) Loss 0.8496 (0.7026) Acc@1 79.688 (84.991) Acc@5 96.191 (97.287) Mem 22341MB +[2024-07-27 21:20:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.595 Acc@5 97.271 +[2024-07-27 21:20:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.6% +[2024-07-27 21:20:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.59% +[2024-07-27 21:20:09 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-27 21:20:13 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-27 21:20:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][0/625] eta 0:12:07 lr 0.000172 wd 0.0500 time 1.1636 (1.1636) data time 0.4360 (0.4360) model time 0.0000 (0.0000) loss 6.9162 (6.9162) grad_norm 1.9016 (1.9016) loss_scale 256.0000 (256.0000) mem 22337MB +[2024-07-27 21:20:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][10/625] eta 0:06:25 lr 0.000172 wd 0.0500 time 0.5761 (0.6273) data time 0.0009 (0.0405) model time 0.0000 (0.0000) loss 6.8806 (7.3238) grad_norm 1.7346 (2.1238) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:20:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][20/625] eta 0:06:03 lr 0.000172 wd 0.0500 time 0.5714 (0.6010) data time 0.0007 (0.0216) model time 0.0000 (0.0000) loss 8.2615 (7.3570) grad_norm 3.1491 (3.0586) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:20:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][30/625] eta 0:05:51 lr 0.000172 wd 0.0500 time 0.5683 (0.5916) data time 0.0009 (0.0150) model time 0.0000 (0.0000) loss 7.0917 (7.2349) grad_norm 2.4314 (2.7466) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:20:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][40/625] eta 0:05:43 lr 0.000172 wd 0.0500 time 0.5717 (0.5869) data time 0.0007 (0.0116) model time 0.0000 (0.0000) loss 5.7791 (7.2703) grad_norm 1.7432 (2.5654) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:20:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][50/625] eta 0:05:35 lr 0.000172 wd 0.0500 time 0.5734 (0.5838) data time 0.0008 (0.0095) model time 0.0000 (0.0000) loss 6.5010 (7.2200) grad_norm 4.0962 (2.5469) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:20:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][60/625] eta 0:05:29 lr 0.000172 wd 0.0500 time 0.5752 (0.5824) data time 0.0010 (0.0081) model time 0.5742 (0.5744) loss 6.8309 (7.1781) grad_norm 1.9473 (2.5321) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:20:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][70/625] eta 0:05:22 lr 0.000172 wd 0.0500 time 0.5774 (0.5813) data time 0.0006 (0.0071) model time 0.5768 (0.5741) loss 6.9444 (7.1786) grad_norm 2.0385 (2.4940) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:21:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][80/625] eta 0:05:16 lr 0.000171 wd 0.0500 time 0.5733 (0.5805) data time 0.0006 (0.0063) model time 0.5727 (0.5739) loss 5.5269 (7.1085) grad_norm 1.9797 (2.4838) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:21:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][90/625] eta 0:05:10 lr 0.000171 wd 0.0500 time 0.5731 (0.5797) data time 0.0008 (0.0057) model time 0.5722 (0.5736) loss 7.3253 (7.1309) grad_norm 2.9607 (2.4652) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:21:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][100/625] eta 0:05:03 lr 0.000171 wd 0.0500 time 0.5717 (0.5790) data time 0.0007 (0.0052) model time 0.5710 (0.5733) loss 6.9785 (7.1548) grad_norm 2.1856 (2.5432) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:21:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][110/625] eta 0:04:58 lr 0.000171 wd 0.0500 time 0.5716 (0.5789) data time 0.0009 (0.0048) model time 0.5707 (0.5738) loss 7.8023 (7.1635) grad_norm 1.9950 (2.5275) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:21:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][120/625] eta 0:04:52 lr 0.000171 wd 0.0500 time 0.5746 (0.5785) data time 0.0006 (0.0045) model time 0.5740 (0.5738) loss 7.7214 (7.1568) grad_norm 1.8852 (2.4871) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:21:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][130/625] eta 0:04:46 lr 0.000171 wd 0.0500 time 0.6024 (0.5797) data time 0.0007 (0.0042) model time 0.6017 (0.5762) loss 7.2242 (7.1820) grad_norm 1.9948 (2.4697) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:21:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][140/625] eta 0:04:41 lr 0.000171 wd 0.0500 time 0.5748 (0.5795) data time 0.0010 (0.0040) model time 0.5738 (0.5761) loss 6.4987 (7.1760) grad_norm 2.4299 (2.4610) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:21:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][150/625] eta 0:04:35 lr 0.000171 wd 0.0500 time 0.5761 (0.5793) data time 0.0007 (0.0038) model time 0.5754 (0.5761) loss 6.4099 (7.1423) grad_norm 3.1501 (2.4683) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:21:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][160/625] eta 0:04:29 lr 0.000171 wd 0.0500 time 0.5828 (0.5791) data time 0.0006 (0.0036) model time 0.5822 (0.5760) loss 6.2319 (7.1172) grad_norm 5.5917 (2.5212) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:21:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][170/625] eta 0:04:23 lr 0.000171 wd 0.0500 time 0.5733 (0.5788) data time 0.0006 (0.0034) model time 0.5727 (0.5758) loss 6.2490 (7.1094) grad_norm 3.6307 (2.5779) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:21:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][180/625] eta 0:04:17 lr 0.000171 wd 0.0500 time 0.5723 (0.5785) data time 0.0006 (0.0033) model time 0.5717 (0.5755) loss 8.4479 (7.1161) grad_norm 4.6063 (2.5890) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:22:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][190/625] eta 0:04:11 lr 0.000171 wd 0.0500 time 0.5759 (0.5791) data time 0.0006 (0.0032) model time 0.5753 (0.5765) loss 6.7038 (7.1205) grad_norm 2.3362 (2.5935) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:22:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][200/625] eta 0:04:06 lr 0.000171 wd 0.0500 time 0.5728 (0.5798) data time 0.0008 (0.0031) model time 0.5720 (0.5775) loss 8.7150 (7.1254) grad_norm 2.1286 (2.5942) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:22:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][210/625] eta 0:04:00 lr 0.000171 wd 0.0500 time 0.5766 (0.5796) data time 0.0009 (0.0030) model time 0.5757 (0.5774) loss 6.4794 (7.1539) grad_norm 2.4959 (2.6363) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:22:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][220/625] eta 0:03:54 lr 0.000170 wd 0.0500 time 0.5808 (0.5794) data time 0.0007 (0.0029) model time 0.5802 (0.5772) loss 8.1810 (7.1370) grad_norm 2.6740 (2.6281) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:22:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][230/625] eta 0:03:48 lr 0.000170 wd 0.0500 time 0.5763 (0.5793) data time 0.0007 (0.0028) model time 0.5756 (0.5771) loss 6.3883 (7.1162) grad_norm 2.6187 (2.6135) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:22:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-27 21:22:30 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 21:22:31 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 21:24:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 21:24:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 21:24:56 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 21:25:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 21:25:10 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 21:25:10 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 21:25:10 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 21:25:10 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 233) +[2024-07-27 21:25:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 21:25:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][240/625] eta 0:21:20 lr 0.000170 wd 0.0500 time 0.6185 (3.3267) data time 0.0008 (0.1616) model time 0.6177 (3.1651) loss 7.3870 (7.3414) grad_norm 3.2677 (2.5892) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:25:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][250/625] eta 0:08:39 lr 0.000170 wd 0.0500 time 0.6122 (1.3851) data time 0.0008 (0.0469) model time 0.6114 (1.3382) loss 7.6709 (7.4463) grad_norm 4.2109 (3.1917) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:25:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][260/625] eta 0:06:27 lr 0.000170 wd 0.0500 time 0.6022 (1.0617) data time 0.0011 (0.0281) model time 0.6011 (1.0337) loss 7.4323 (7.4709) grad_norm 2.1224 (3.1168) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:25:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][270/625] eta 0:05:29 lr 0.000170 wd 0.0500 time 0.6037 (0.9268) data time 0.0008 (0.0201) model time 0.6028 (0.9067) loss 6.1280 (7.4529) grad_norm 3.4138 (2.8917) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:25:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][280/625] eta 0:04:54 lr 0.000170 wd 0.0500 time 0.6085 (0.8533) data time 0.0008 (0.0158) model time 0.6077 (0.8375) loss 6.4575 (7.3679) grad_norm 4.3122 (2.7950) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:25:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][290/625] eta 0:04:31 lr 0.000170 wd 0.0500 time 0.6028 (0.8099) data time 0.0008 (0.0131) model time 0.6020 (0.7968) loss 7.6708 (7.3310) grad_norm 2.1933 (2.7241) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:26:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][300/625] eta 0:04:14 lr 0.000170 wd 0.0500 time 0.6111 (0.7817) data time 0.0008 (0.0112) model time 0.6102 (0.7705) loss 7.7401 (7.2781) grad_norm 3.2492 (2.7481) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:26:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][310/625] eta 0:03:59 lr 0.000170 wd 0.0500 time 0.6156 (0.7588) data time 0.0008 (0.0099) model time 0.6147 (0.7490) loss 6.7497 (7.2365) grad_norm 3.9316 (3.2283) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:26:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][320/625] eta 0:03:46 lr 0.000170 wd 0.0500 time 0.6138 (0.7414) data time 0.0010 (0.0088) model time 0.6127 (0.7326) loss 7.0269 (7.2083) grad_norm 2.0733 (3.1483) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:26:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][330/625] eta 0:03:34 lr 0.000170 wd 0.0500 time 0.6123 (0.7279) data time 0.0010 (0.0080) model time 0.6112 (0.7199) loss 6.6156 (7.1897) grad_norm 2.0311 (3.0633) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:26:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][340/625] eta 0:03:24 lr 0.000170 wd 0.0500 time 0.6047 (0.7163) data time 0.0010 (0.0073) model time 0.6037 (0.7090) loss 7.9716 (7.2557) grad_norm 2.3762 (3.0339) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:26:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][350/625] eta 0:03:14 lr 0.000170 wd 0.0500 time 0.6115 (0.7067) data time 0.0010 (0.0068) model time 0.6105 (0.6999) loss 7.9863 (7.2585) grad_norm 5.8710 (2.9687) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:26:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][360/625] eta 0:03:05 lr 0.000169 wd 0.0500 time 0.6275 (0.6990) data time 0.0011 (0.0063) model time 0.6264 (0.6926) loss 6.3839 (7.2357) grad_norm 2.9227 (2.9115) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:26:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][370/625] eta 0:02:56 lr 0.000169 wd 0.0500 time 0.6180 (0.6923) data time 0.0010 (0.0059) model time 0.6170 (0.6863) loss 8.5647 (7.2479) grad_norm 2.5074 (2.8786) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:26:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][380/625] eta 0:02:48 lr 0.000169 wd 0.0500 time 0.6154 (0.6868) data time 0.0007 (0.0056) model time 0.6146 (0.6812) loss 6.9451 (7.2304) grad_norm 1.8944 (2.8333) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:27:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][390/625] eta 0:02:40 lr 0.000169 wd 0.0500 time 0.6159 (0.6821) data time 0.0007 (0.0053) model time 0.6152 (0.6768) loss 6.6182 (7.2111) grad_norm 2.3782 (2.7899) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:27:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][400/625] eta 0:02:32 lr 0.000169 wd 0.0500 time 0.6172 (0.6780) data time 0.0008 (0.0051) model time 0.6164 (0.6730) loss 7.2933 (7.2182) grad_norm 1.8011 (2.7818) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:27:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][410/625] eta 0:02:24 lr 0.000169 wd 0.0500 time 0.6039 (0.6742) data time 0.0010 (0.0048) model time 0.6029 (0.6694) loss 6.0425 (7.2331) grad_norm 1.6834 (2.7719) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:27:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][420/625] eta 0:02:17 lr 0.000169 wd 0.0500 time 0.6065 (0.6706) data time 0.0009 (0.0046) model time 0.6057 (0.6659) loss 6.9911 (7.2263) grad_norm 2.2394 (2.7258) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:27:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][430/625] eta 0:02:10 lr 0.000169 wd 0.0500 time 0.6119 (0.6674) data time 0.0008 (0.0045) model time 0.6111 (0.6629) loss 6.8470 (7.2053) grad_norm 1.7053 (2.6965) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:27:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][440/625] eta 0:02:02 lr 0.000169 wd 0.0500 time 0.6028 (0.6644) data time 0.0012 (0.0043) model time 0.6015 (0.6601) loss 7.5907 (7.1918) grad_norm 1.8885 (2.6768) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:27:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][450/625] eta 0:01:55 lr 0.000169 wd 0.0500 time 0.6159 (0.6620) data time 0.0007 (0.0042) model time 0.6152 (0.6579) loss 7.1914 (7.1809) grad_norm 2.5372 (2.6620) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:27:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][460/625] eta 0:01:48 lr 0.000169 wd 0.0500 time 0.6116 (0.6600) data time 0.0010 (0.0040) model time 0.6106 (0.6559) loss 8.4824 (7.1757) grad_norm 2.1633 (2.6516) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:27:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][470/625] eta 0:01:42 lr 0.000169 wd 0.0500 time 0.6148 (0.6582) data time 0.0008 (0.0039) model time 0.6140 (0.6543) loss 5.8546 (7.1691) grad_norm 1.8877 (2.6330) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:27:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][480/625] eta 0:01:35 lr 0.000169 wd 0.0500 time 0.6089 (0.6562) data time 0.0008 (0.0038) model time 0.6081 (0.6524) loss 6.2393 (7.1761) grad_norm 2.1305 (2.6248) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:27:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-27 21:27:56 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 21:27:59 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 21:39:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 21:39:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 21:39:48 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 21:39:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 21:39:59 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 21:40:00 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 21:40:00 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 21:40:00 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 233) +[2024-07-27 21:40:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 21:42:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 21:42:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 21:42:28 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 21:42:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 21:42:52 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 21:42:53 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 21:42:53 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 21:42:53 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 233) +[2024-07-27 21:42:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 21:43:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][490/625] eta 0:03:47 lr 0.000169 wd 0.0500 time 0.5973 (1.6877) data time 0.0008 (0.0658) model time 0.5965 (1.6219) loss 7.7903 (7.5529) grad_norm 2.1330 (2.7284) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:43:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][500/625] eta 0:02:21 lr 0.000168 wd 0.0500 time 0.5736 (1.1302) data time 0.0006 (0.0333) model time 0.5730 (1.0969) loss 7.4951 (7.3889) grad_norm 3.0669 (2.7431) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:43:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][510/625] eta 0:01:48 lr 0.000168 wd 0.0500 time 0.5730 (0.9442) data time 0.0009 (0.0225) model time 0.5721 (0.9217) loss 8.1229 (7.4048) grad_norm 4.0877 (2.8782) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:43:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][520/625] eta 0:01:29 lr 0.000168 wd 0.0500 time 0.5746 (0.8515) data time 0.0007 (0.0171) model time 0.5739 (0.8344) loss 6.5158 (7.3078) grad_norm 2.1524 (2.8581) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:43:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][530/625] eta 0:01:15 lr 0.000168 wd 0.0500 time 0.5741 (0.7960) data time 0.0008 (0.0138) model time 0.5733 (0.7821) loss 6.5613 (7.2856) grad_norm 1.8262 (2.8445) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:43:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][540/625] eta 0:01:05 lr 0.000168 wd 0.0500 time 0.5857 (0.7658) data time 0.0007 (0.0117) model time 0.5850 (0.7542) loss 6.9533 (7.2434) grad_norm 2.3510 (2.7617) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:43:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][550/625] eta 0:00:55 lr 0.000168 wd 0.0500 time 0.5748 (0.7388) data time 0.0006 (0.0101) model time 0.5742 (0.7287) loss 6.5629 (7.2281) grad_norm 3.1543 (3.1792) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:43:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][560/625] eta 0:00:46 lr 0.000168 wd 0.0500 time 0.5790 (0.7184) data time 0.0009 (0.0090) model time 0.5780 (0.7094) loss 8.0092 (7.2289) grad_norm 2.6711 (3.1137) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:44:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][570/625] eta 0:00:38 lr 0.000168 wd 0.0500 time 0.5743 (0.7025) data time 0.0006 (0.0081) model time 0.5736 (0.6944) loss 8.2907 (7.2110) grad_norm 2.6809 (3.1403) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 21:44:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][580/625] eta 0:00:31 lr 0.000168 wd 0.0500 time 0.5759 (0.6900) data time 0.0009 (0.0073) model time 0.5750 (0.6827) loss 8.1536 (7.2231) grad_norm 2.3192 (3.0522) loss_scale 512.0000 (263.6800) mem 22344MB +[2024-07-27 21:44:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][590/625] eta 0:00:23 lr 0.000168 wd 0.0500 time 0.5752 (0.6796) data time 0.0008 (0.0067) model time 0.5744 (0.6729) loss 6.5529 (7.2147) grad_norm 1.8591 (3.0266) loss_scale 512.0000 (286.2545) mem 22344MB +[2024-07-27 21:44:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][600/625] eta 0:00:16 lr 0.000168 wd 0.0500 time 0.5716 (0.6709) data time 0.0007 (0.0062) model time 0.5709 (0.6647) loss 7.6831 (7.2217) grad_norm 3.0496 (2.9994) loss_scale 512.0000 (305.0667) mem 22344MB +[2024-07-27 21:44:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][610/625] eta 0:00:09 lr 0.000168 wd 0.0500 time 0.5718 (0.6637) data time 0.0005 (0.0059) model time 0.5713 (0.6578) loss 6.6697 (7.1964) grad_norm 2.5246 (3.0018) loss_scale 512.0000 (320.9846) mem 22344MB +[2024-07-27 21:44:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [233/300][620/625] eta 0:00:03 lr 0.000168 wd 0.0500 time 0.5738 (0.6573) data time 0.0004 (0.0055) model time 0.5734 (0.6518) loss 6.3145 (7.1998) grad_norm 2.0481 (2.9953) loss_scale 512.0000 (334.6286) mem 22344MB +[2024-07-27 21:44:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 233 training takes 0:01:34 +[2024-07-27 21:44:31 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 21:44:37 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 21:44:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.465 (0.465) Loss 0.5093 (0.5093) Acc@1 89.600 (89.600) Acc@5 99.023 (99.023) Mem 22344MB +[2024-07-27 21:44:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7563 (0.6188) Acc@1 83.447 (87.833) Acc@5 96.777 (98.002) Mem 22344MB +[2024-07-27 21:44:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8413 (0.7075) Acc@1 80.762 (85.128) Acc@5 96.436 (97.156) Mem 22344MB +[2024-07-27 21:44:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.745 Acc@5 97.169 +[2024-07-27 21:44:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.7% +[2024-07-27 21:44:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 84.74% +[2024-07-27 21:44:42 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-27 21:44:43 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-27 21:44:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.464 (0.464) Loss 0.5059 (0.5059) Acc@1 90.430 (90.430) Acc@5 99.023 (99.023) Mem 22344MB +[2024-07-27 21:44:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.156) Loss 0.7451 (0.6151) Acc@1 83.398 (87.851) Acc@5 96.875 (98.091) Mem 22344MB +[2024-07-27 21:44:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.141) Loss 0.8481 (0.7022) Acc@1 79.688 (85.031) Acc@5 96.191 (97.284) Mem 22344MB +[2024-07-27 21:44:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.627 Acc@5 97.271 +[2024-07-27 21:44:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.6% +[2024-07-27 21:44:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.63% +[2024-07-27 21:44:47 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-27 21:44:50 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-27 21:44:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][0/625] eta 0:11:09 lr 0.000168 wd 0.0500 time 1.0709 (1.0709) data time 0.4144 (0.4144) model time 0.0000 (0.0000) loss 7.3923 (7.3923) grad_norm 2.6418 (2.6418) loss_scale 512.0000 (512.0000) mem 22337MB +[2024-07-27 21:44:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][10/625] eta 0:06:22 lr 0.000167 wd 0.0500 time 0.5795 (0.6216) data time 0.0008 (0.0384) model time 0.0000 (0.0000) loss 8.2325 (7.0441) grad_norm 2.3788 (2.3212) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 21:45:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][20/625] eta 0:06:03 lr 0.000167 wd 0.0500 time 0.5863 (0.6013) data time 0.0008 (0.0205) model time 0.0000 (0.0000) loss 7.4918 (7.0412) grad_norm 3.3437 (3.3841) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 21:45:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][30/625] eta 0:05:52 lr 0.000167 wd 0.0500 time 0.5750 (0.5924) data time 0.0006 (0.0142) model time 0.0000 (0.0000) loss 6.4219 (7.1087) grad_norm 1.8127 (3.2552) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 21:45:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][40/625] eta 0:05:45 lr 0.000167 wd 0.0500 time 0.5791 (0.5903) data time 0.0008 (0.0109) model time 0.0000 (0.0000) loss 6.9719 (7.0777) grad_norm 2.6158 (3.0501) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 21:45:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][50/625] eta 0:05:37 lr 0.000167 wd 0.0500 time 0.5792 (0.5876) data time 0.0008 (0.0090) model time 0.0000 (0.0000) loss 6.0972 (7.0311) grad_norm 2.5450 (2.9147) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 21:45:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][60/625] eta 0:05:30 lr 0.000167 wd 0.0500 time 0.5771 (0.5856) data time 0.0009 (0.0077) model time 0.5762 (0.5747) loss 6.7878 (7.0043) grad_norm 1.8575 (2.8581) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 21:45:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][70/625] eta 0:05:24 lr 0.000167 wd 0.0500 time 0.5795 (0.5844) data time 0.0008 (0.0067) model time 0.5787 (0.5755) loss 6.7143 (6.9857) grad_norm 4.4268 (2.8313) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 21:45:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][80/625] eta 0:05:18 lr 0.000167 wd 0.0500 time 0.5806 (0.5835) data time 0.0006 (0.0060) model time 0.5800 (0.5758) loss 7.1997 (6.9822) grad_norm 1.5362 (2.7472) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 21:45:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][90/625] eta 0:05:11 lr 0.000167 wd 0.0500 time 0.5766 (0.5828) data time 0.0008 (0.0054) model time 0.5758 (0.5759) loss 8.0824 (6.9883) grad_norm 2.0364 (2.6926) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 21:45:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][100/625] eta 0:05:05 lr 0.000167 wd 0.0500 time 0.5810 (0.5821) data time 0.0008 (0.0049) model time 0.5802 (0.5758) loss 7.6138 (7.0030) grad_norm 2.3610 (2.6240) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-27 21:45:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-27 21:45:49 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 21:45:50 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 21:47:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 21:47:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 21:48:20 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 21:48:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 21:48:32 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 21:48:32 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 21:48:32 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 21:48:32 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 234) +[2024-07-27 21:48:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 21:48:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][110/625] eta 0:14:49 lr 0.000167 wd 0.0500 time 0.6181 (1.7271) data time 0.0011 (0.0740) model time 0.6170 (1.6531) loss 8.0568 (7.8320) grad_norm 2.3759 (2.6565) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-27 21:49:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][120/625] eta 0:09:52 lr 0.000167 wd 0.0500 time 0.6172 (1.1723) data time 0.0009 (0.0376) model time 0.6163 (1.1348) loss 8.2949 (7.4042) grad_norm 2.2732 (2.3795) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-27 21:49:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][130/625] eta 0:08:08 lr 0.000167 wd 0.0500 time 0.6154 (0.9859) data time 0.0012 (0.0254) model time 0.6142 (0.9605) loss 8.2844 (7.5468) grad_norm 2.3593 (2.5266) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-27 21:49:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][140/625] eta 0:07:12 lr 0.000167 wd 0.0500 time 0.6135 (0.8928) data time 0.0009 (0.0193) model time 0.6125 (0.8734) loss 6.1382 (7.3531) grad_norm 2.7556 (2.5002) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-27 21:49:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][150/625] eta 0:06:37 lr 0.000166 wd 0.0500 time 0.6136 (0.8372) data time 0.0012 (0.0157) model time 0.6124 (0.8215) loss 6.9610 (7.3374) grad_norm 1.7815 (2.5456) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-27 21:49:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][160/625] eta 0:06:15 lr 0.000166 wd 0.0500 time 0.6194 (0.8068) data time 0.0011 (0.0133) model time 0.6183 (0.7935) loss 7.3682 (7.2776) grad_norm 2.4285 (2.5055) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-27 21:49:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][170/625] eta 0:05:55 lr 0.000166 wd 0.0500 time 0.6225 (0.7804) data time 0.0009 (0.0115) model time 0.6216 (0.7689) loss 6.0453 (7.2553) grad_norm 1.8472 (2.4527) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-27 21:49:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][180/625] eta 0:05:38 lr 0.000166 wd 0.0500 time 0.6270 (0.7607) data time 0.0011 (0.0102) model time 0.6260 (0.7505) loss 7.6816 (7.2221) grad_norm 2.2775 (2.4209) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-27 21:49:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][190/625] eta 0:05:24 lr 0.000166 wd 0.0500 time 0.6239 (0.7454) data time 0.0009 (0.0092) model time 0.6231 (0.7363) loss 7.5795 (7.1881) grad_norm 2.0172 (2.4006) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-27 21:49:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][200/625] eta 0:05:11 lr 0.000166 wd 0.0500 time 0.6175 (0.7330) data time 0.0011 (0.0084) model time 0.6164 (0.7246) loss 8.4459 (7.2102) grad_norm 1.9735 (2.4205) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-27 21:49:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][210/625] eta 0:04:59 lr 0.000166 wd 0.0500 time 0.6184 (0.7225) data time 0.0011 (0.0077) model time 0.6173 (0.7148) loss 6.6255 (7.2205) grad_norm 2.1921 (2.4236) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-27 21:50:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][220/625] eta 0:04:49 lr 0.000166 wd 0.0500 time 0.6140 (0.7137) data time 0.0008 (0.0072) model time 0.6131 (0.7066) loss 7.1370 (7.2300) grad_norm 1.8512 (2.4565) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-27 21:50:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][230/625] eta 0:04:39 lr 0.000166 wd 0.0500 time 0.6196 (0.7064) data time 0.0008 (0.0067) model time 0.6188 (0.6997) loss 6.3251 (7.1788) grad_norm 1.7913 (2.4728) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-27 21:50:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][240/625] eta 0:04:29 lr 0.000166 wd 0.0500 time 0.6238 (0.7005) data time 0.0009 (0.0063) model time 0.6230 (0.6942) loss 5.8370 (7.1785) grad_norm 2.1498 (2.4460) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-27 21:50:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][250/625] eta 0:04:20 lr 0.000166 wd 0.0500 time 0.6264 (0.6954) data time 0.0011 (0.0059) model time 0.6253 (0.6895) loss 7.8286 (7.1731) grad_norm 2.9866 (2.4299) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-27 21:50:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][260/625] eta 0:04:12 lr 0.000166 wd 0.0500 time 0.6296 (0.6911) data time 0.0011 (0.0056) model time 0.6284 (0.6855) loss 6.9599 (7.1595) grad_norm 3.8758 (2.5298) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-27 21:50:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][270/625] eta 0:04:03 lr 0.000166 wd 0.0500 time 0.6197 (0.6871) data time 0.0009 (0.0054) model time 0.6187 (0.6817) loss 5.4653 (7.1572) grad_norm 3.8668 (2.5229) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-27 21:50:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][280/625] eta 0:03:55 lr 0.000166 wd 0.0500 time 0.6218 (0.6833) data time 0.0008 (0.0052) model time 0.6210 (0.6781) loss 6.2140 (7.1385) grad_norm 2.5919 (2.5184) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-27 21:50:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][290/625] eta 0:03:47 lr 0.000165 wd 0.0500 time 0.6165 (0.6800) data time 0.0008 (0.0049) model time 0.6156 (0.6751) loss 6.5282 (7.1476) grad_norm 2.6383 (2.5213) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-27 21:50:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][300/625] eta 0:03:40 lr 0.000165 wd 0.0500 time 0.6176 (0.6769) data time 0.0011 (0.0047) model time 0.6166 (0.6722) loss 7.7106 (7.1278) grad_norm 2.7140 (2.5164) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-27 21:50:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][310/625] eta 0:03:32 lr 0.000165 wd 0.0500 time 0.6274 (0.6744) data time 0.0009 (0.0046) model time 0.6265 (0.6698) loss 6.2029 (7.1162) grad_norm 2.4850 (2.5899) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-27 21:51:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][320/625] eta 0:03:25 lr 0.000165 wd 0.0500 time 0.6298 (0.6722) data time 0.0011 (0.0044) model time 0.6287 (0.6678) loss 7.0019 (7.1181) grad_norm 2.4899 (2.5938) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-27 21:51:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][330/625] eta 0:03:17 lr 0.000165 wd 0.0500 time 0.6276 (0.6702) data time 0.0011 (0.0043) model time 0.6265 (0.6659) loss 7.9868 (7.1231) grad_norm 3.2515 (2.5997) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-27 21:51:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][340/625] eta 0:03:10 lr 0.000165 wd 0.0500 time 0.6113 (0.6683) data time 0.0011 (0.0041) model time 0.6102 (0.6641) loss 8.2211 (7.1167) grad_norm inf (inf) loss_scale 256.0000 (510.9333) mem 22343MB +[2024-07-27 21:51:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][350/625] eta 0:03:03 lr 0.000165 wd 0.0500 time 0.6152 (0.6663) data time 0.0008 (0.0040) model time 0.6144 (0.6623) loss 5.6352 (7.0994) grad_norm 3.4844 (inf) loss_scale 256.0000 (500.7360) mem 22343MB +[2024-07-27 21:51:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][360/625] eta 0:02:56 lr 0.000165 wd 0.0500 time 0.6178 (0.6646) data time 0.0011 (0.0039) model time 0.6167 (0.6607) loss 6.7173 (7.0918) grad_norm 2.2114 (inf) loss_scale 256.0000 (491.3231) mem 22343MB +[2024-07-27 21:51:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][370/625] eta 0:02:49 lr 0.000165 wd 0.0500 time 0.6201 (0.6629) data time 0.0008 (0.0038) model time 0.6193 (0.6591) loss 7.3263 (7.0834) grad_norm 1.9949 (inf) loss_scale 256.0000 (482.6074) mem 22343MB +[2024-07-27 21:51:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][380/625] eta 0:02:42 lr 0.000165 wd 0.0500 time 0.6267 (0.6623) data time 0.0011 (0.0037) model time 0.6255 (0.6586) loss 7.5853 (7.0929) grad_norm 1.9173 (inf) loss_scale 256.0000 (474.5143) mem 22343MB +[2024-07-27 21:51:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][390/625] eta 0:02:35 lr 0.000165 wd 0.0500 time 0.6279 (0.6610) data time 0.0011 (0.0036) model time 0.6269 (0.6574) loss 6.1940 (7.0901) grad_norm 2.4095 (inf) loss_scale 256.0000 (466.9793) mem 22343MB +[2024-07-27 21:51:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][400/625] eta 0:02:28 lr 0.000165 wd 0.0500 time 0.6295 (0.6599) data time 0.0008 (0.0035) model time 0.6287 (0.6564) loss 6.5792 (7.0834) grad_norm 1.6750 (inf) loss_scale 256.0000 (459.9467) mem 22343MB +[2024-07-27 21:52:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][410/625] eta 0:02:21 lr 0.000165 wd 0.0500 time 0.6263 (0.6589) data time 0.0011 (0.0034) model time 0.6253 (0.6554) loss 6.4618 (7.0855) grad_norm 3.0090 (inf) loss_scale 256.0000 (453.3677) mem 22343MB +[2024-07-27 21:52:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][420/625] eta 0:02:14 lr 0.000165 wd 0.0500 time 0.6209 (0.6577) data time 0.0011 (0.0034) model time 0.6197 (0.6543) loss 6.7955 (7.1055) grad_norm 1.8144 (inf) loss_scale 256.0000 (447.2000) mem 22343MB +[2024-07-27 21:52:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][430/625] eta 0:02:08 lr 0.000164 wd 0.0500 time 0.6179 (0.6566) data time 0.0009 (0.0033) model time 0.6170 (0.6532) loss 6.7891 (7.1099) grad_norm 2.0917 (inf) loss_scale 256.0000 (441.4061) mem 22343MB +[2024-07-27 21:52:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][440/625] eta 0:02:01 lr 0.000164 wd 0.0500 time 0.6191 (0.6555) data time 0.0011 (0.0033) model time 0.6181 (0.6523) loss 6.4727 (7.1133) grad_norm 2.3581 (inf) loss_scale 256.0000 (435.9529) mem 22343MB +[2024-07-27 21:52:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][450/625] eta 0:01:54 lr 0.000164 wd 0.0500 time 0.6143 (0.6545) data time 0.0011 (0.0032) model time 0.6132 (0.6513) loss 6.4060 (7.1153) grad_norm 2.4286 (inf) loss_scale 256.0000 (430.8114) mem 22343MB +[2024-07-27 21:52:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][460/625] eta 0:01:47 lr 0.000164 wd 0.0500 time 0.6262 (0.6537) data time 0.0009 (0.0031) model time 0.6253 (0.6505) loss 7.4299 (7.1185) grad_norm 2.0074 (inf) loss_scale 256.0000 (425.9556) mem 22343MB +[2024-07-27 21:52:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][470/625] eta 0:01:41 lr 0.000164 wd 0.0500 time 0.6264 (0.6529) data time 0.0011 (0.0031) model time 0.6253 (0.6498) loss 7.6978 (7.1191) grad_norm 2.2777 (inf) loss_scale 256.0000 (421.3622) mem 22343MB +[2024-07-27 21:52:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][480/625] eta 0:01:34 lr 0.000164 wd 0.0500 time 0.6345 (0.6522) data time 0.0009 (0.0030) model time 0.6336 (0.6492) loss 6.5240 (7.1112) grad_norm 3.2359 (inf) loss_scale 256.0000 (417.0105) mem 22343MB +[2024-07-27 21:52:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][490/625] eta 0:01:27 lr 0.000164 wd 0.0500 time 0.6181 (0.6515) data time 0.0010 (0.0030) model time 0.6171 (0.6485) loss 6.6142 (7.0977) grad_norm 2.7364 (inf) loss_scale 256.0000 (412.8821) mem 22343MB +[2024-07-27 21:52:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][500/625] eta 0:01:21 lr 0.000164 wd 0.0500 time 0.6225 (0.6507) data time 0.0011 (0.0029) model time 0.6214 (0.6478) loss 7.5382 (7.1032) grad_norm 3.0471 (inf) loss_scale 256.0000 (408.9600) mem 22343MB +[2024-07-27 21:53:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][510/625] eta 0:01:14 lr 0.000164 wd 0.0500 time 0.6191 (0.6499) data time 0.0008 (0.0029) model time 0.6184 (0.6471) loss 7.2000 (7.1096) grad_norm 2.5016 (inf) loss_scale 256.0000 (405.2293) mem 22343MB +[2024-07-27 21:53:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][520/625] eta 0:01:08 lr 0.000164 wd 0.0500 time 0.6191 (0.6493) data time 0.0008 (0.0028) model time 0.6183 (0.6464) loss 6.9603 (7.1035) grad_norm 2.6091 (inf) loss_scale 256.0000 (401.6762) mem 22343MB +[2024-07-27 21:53:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][530/625] eta 0:01:01 lr 0.000164 wd 0.0500 time 0.6268 (0.6487) data time 0.0011 (0.0028) model time 0.6257 (0.6459) loss 7.8663 (7.1124) grad_norm 4.0926 (inf) loss_scale 256.0000 (398.2884) mem 22343MB +[2024-07-27 21:53:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][540/625] eta 0:00:55 lr 0.000164 wd 0.0500 time 0.6219 (0.6482) data time 0.0011 (0.0028) model time 0.6208 (0.6455) loss 6.7808 (7.1138) grad_norm 3.6632 (inf) loss_scale 256.0000 (395.0545) mem 22343MB +[2024-07-27 21:53:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][550/625] eta 0:00:48 lr 0.000164 wd 0.0500 time 0.6274 (0.6478) data time 0.0008 (0.0027) model time 0.6266 (0.6451) loss 7.2416 (7.1096) grad_norm 2.7730 (inf) loss_scale 256.0000 (391.9644) mem 22343MB +[2024-07-27 21:53:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][560/625] eta 0:00:42 lr 0.000164 wd 0.0500 time 0.6238 (0.6473) data time 0.0008 (0.0027) model time 0.6229 (0.6447) loss 7.5312 (7.1075) grad_norm 2.2824 (inf) loss_scale 256.0000 (389.0087) mem 22343MB +[2024-07-27 21:53:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][570/625] eta 0:00:35 lr 0.000163 wd 0.0500 time 0.6363 (0.6468) data time 0.0011 (0.0027) model time 0.6353 (0.6441) loss 7.2952 (7.0990) grad_norm 12.3563 (inf) loss_scale 256.0000 (386.1787) mem 22343MB +[2024-07-27 21:53:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][580/625] eta 0:00:29 lr 0.000163 wd 0.0500 time 0.6180 (0.6462) data time 0.0011 (0.0026) model time 0.6169 (0.6436) loss 5.9857 (7.0939) grad_norm 2.1216 (inf) loss_scale 256.0000 (383.4667) mem 22343MB +[2024-07-27 21:53:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][590/625] eta 0:00:22 lr 0.000163 wd 0.0500 time 0.6190 (0.6457) data time 0.0009 (0.0026) model time 0.6181 (0.6431) loss 6.5431 (7.0977) grad_norm 2.8521 (inf) loss_scale 256.0000 (380.8653) mem 22343MB +[2024-07-27 21:54:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][600/625] eta 0:00:16 lr 0.000163 wd 0.0500 time 0.6252 (0.6459) data time 0.0009 (0.0026) model time 0.6243 (0.6433) loss 6.6559 (7.0993) grad_norm 2.3236 (inf) loss_scale 256.0000 (378.3680) mem 22343MB +[2024-07-27 21:54:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][610/625] eta 0:00:09 lr 0.000163 wd 0.0500 time 0.6209 (0.6454) data time 0.0008 (0.0026) model time 0.6201 (0.6429) loss 7.8141 (7.1128) grad_norm 4.4142 (inf) loss_scale 256.0000 (375.9686) mem 22343MB +[2024-07-27 21:54:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [234/300][620/625] eta 0:00:03 lr 0.000163 wd 0.0500 time 0.6277 (0.6451) data time 0.0006 (0.0025) model time 0.6271 (0.6425) loss 7.1193 (7.1139) grad_norm 3.6946 (inf) loss_scale 256.0000 (373.6615) mem 22343MB +[2024-07-27 21:54:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 234 training takes 0:05:37 +[2024-07-27 21:54:15 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 21:54:18 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 21:54:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.519 (0.519) Loss 0.5078 (0.5078) Acc@1 89.893 (89.893) Acc@5 99.023 (99.023) Mem 22343MB +[2024-07-27 21:54:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.162) Loss 0.7344 (0.6125) Acc@1 83.057 (87.775) Acc@5 97.266 (98.118) Mem 22343MB +[2024-07-27 21:54:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.145) Loss 0.8579 (0.7014) Acc@1 79.980 (85.110) Acc@5 95.996 (97.268) Mem 22343MB +[2024-07-27 21:54:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.711 Acc@5 97.265 +[2024-07-27 21:54:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.7% +[2024-07-27 21:54:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.839 (0.839) Loss 0.5049 (0.5049) Acc@1 90.430 (90.430) Acc@5 99.023 (99.023) Mem 22343MB +[2024-07-27 21:54:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.192) Loss 0.7451 (0.6147) Acc@1 83.350 (87.860) Acc@5 96.875 (98.105) Mem 22343MB +[2024-07-27 21:54:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.160) Loss 0.8481 (0.7018) Acc@1 79.736 (85.042) Acc@5 96.191 (97.296) Mem 22343MB +[2024-07-27 21:54:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.635 Acc@5 97.285 +[2024-07-27 21:54:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.6% +[2024-07-27 21:54:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.63% +[2024-07-27 21:54:28 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-27 21:54:33 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-27 21:54:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][0/625] eta 0:12:03 lr 0.000163 wd 0.0500 time 1.1572 (1.1572) data time 0.4392 (0.4392) model time 0.0000 (0.0000) loss 8.5504 (8.5504) grad_norm 2.3556 (2.3556) loss_scale 256.0000 (256.0000) mem 22337MB +[2024-07-27 21:54:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][10/625] eta 0:06:51 lr 0.000163 wd 0.0500 time 0.6185 (0.6696) data time 0.0008 (0.0409) model time 0.0000 (0.0000) loss 6.7668 (7.2370) grad_norm 2.4900 (2.6090) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:54:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][20/625] eta 0:06:31 lr 0.000163 wd 0.0500 time 0.6175 (0.6465) data time 0.0009 (0.0219) model time 0.0000 (0.0000) loss 6.2899 (7.1296) grad_norm 2.2077 (2.6351) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:54:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][30/625] eta 0:06:19 lr 0.000163 wd 0.0500 time 0.6130 (0.6370) data time 0.0012 (0.0152) model time 0.0000 (0.0000) loss 7.1811 (7.2030) grad_norm 2.6789 (2.8387) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:54:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][40/625] eta 0:06:10 lr 0.000163 wd 0.0500 time 0.6268 (0.6326) data time 0.0012 (0.0118) model time 0.0000 (0.0000) loss 7.0812 (7.2312) grad_norm 2.7531 (2.8053) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:55:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][50/625] eta 0:06:02 lr 0.000163 wd 0.0500 time 0.6163 (0.6300) data time 0.0011 (0.0097) model time 0.0000 (0.0000) loss 8.0954 (7.1907) grad_norm 2.2015 (3.1528) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:55:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][60/625] eta 0:05:55 lr 0.000163 wd 0.0500 time 0.6235 (0.6283) data time 0.0011 (0.0083) model time 0.6223 (0.6189) loss 5.8897 (7.2052) grad_norm 2.4079 (3.0266) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:55:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][70/625] eta 0:05:48 lr 0.000163 wd 0.0500 time 0.6215 (0.6274) data time 0.0009 (0.0073) model time 0.6206 (0.6197) loss 8.5176 (7.2362) grad_norm 4.2080 (2.9853) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:55:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][80/625] eta 0:05:41 lr 0.000163 wd 0.0500 time 0.6252 (0.6267) data time 0.0009 (0.0065) model time 0.6243 (0.6201) loss 7.1620 (7.2235) grad_norm 1.8220 (2.9328) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:55:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][90/625] eta 0:05:35 lr 0.000162 wd 0.0500 time 0.6281 (0.6263) data time 0.0008 (0.0059) model time 0.6273 (0.6205) loss 8.0421 (7.2209) grad_norm 2.8905 (2.8566) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:55:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][100/625] eta 0:05:28 lr 0.000162 wd 0.0500 time 0.6206 (0.6257) data time 0.0011 (0.0054) model time 0.6195 (0.6202) loss 7.5357 (7.2165) grad_norm 2.2475 (2.8531) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:55:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][110/625] eta 0:05:21 lr 0.000162 wd 0.0500 time 0.6190 (0.6251) data time 0.0011 (0.0050) model time 0.6179 (0.6197) loss 6.9921 (7.2064) grad_norm 5.0402 (2.8981) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:55:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][120/625] eta 0:05:15 lr 0.000162 wd 0.0500 time 0.6192 (0.6246) data time 0.0009 (0.0047) model time 0.6183 (0.6196) loss 6.4351 (7.1547) grad_norm 4.5008 (2.9443) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:55:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][130/625] eta 0:05:09 lr 0.000162 wd 0.0500 time 0.6200 (0.6244) data time 0.0009 (0.0044) model time 0.6191 (0.6198) loss 7.0602 (7.1481) grad_norm 1.8709 (2.9212) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:56:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][140/625] eta 0:05:02 lr 0.000162 wd 0.0500 time 0.6213 (0.6243) data time 0.0009 (0.0042) model time 0.6204 (0.6200) loss 6.3598 (7.1083) grad_norm 2.7333 (2.8838) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:56:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][150/625] eta 0:04:56 lr 0.000162 wd 0.0500 time 0.6290 (0.6244) data time 0.0009 (0.0040) model time 0.6281 (0.6204) loss 7.2277 (7.1098) grad_norm 2.7489 (2.9828) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:56:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][160/625] eta 0:04:50 lr 0.000162 wd 0.0500 time 0.6244 (0.6243) data time 0.0011 (0.0038) model time 0.6233 (0.6206) loss 7.4915 (7.1000) grad_norm 2.3089 (2.9952) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:56:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][170/625] eta 0:04:44 lr 0.000162 wd 0.0500 time 0.6202 (0.6242) data time 0.0011 (0.0037) model time 0.6191 (0.6207) loss 6.2993 (7.0853) grad_norm 2.4225 (2.9713) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:56:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][180/625] eta 0:04:37 lr 0.000162 wd 0.0500 time 0.6212 (0.6240) data time 0.0008 (0.0035) model time 0.6204 (0.6205) loss 8.1515 (7.0926) grad_norm 3.3912 (2.9529) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:56:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][190/625] eta 0:04:31 lr 0.000162 wd 0.0500 time 0.8074 (0.6249) data time 0.0011 (0.0034) model time 0.8063 (0.6219) loss 8.1062 (7.0847) grad_norm 2.9645 (2.9420) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 21:56:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-27 21:56:33 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 21:56:35 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 21:59:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 21:59:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 21:59:28 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 21:59:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 21:59:41 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 21:59:42 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 21:59:42 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 21:59:42 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 235) +[2024-07-27 21:59:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 22:00:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][200/625] eta 0:13:20 lr 0.000162 wd 0.0500 time 0.5960 (1.8824) data time 0.0008 (0.0781) model time 0.5952 (1.8042) loss 7.7368 (7.5696) grad_norm 2.9132 (3.2096) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:00:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][210/625] eta 0:08:21 lr 0.000162 wd 0.0500 time 0.5996 (1.2081) data time 0.0010 (0.0376) model time 0.5986 (1.1705) loss 6.6383 (7.3950) grad_norm 2.5627 (2.9888) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:00:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][220/625] eta 0:06:43 lr 0.000162 wd 0.0500 time 0.5946 (0.9971) data time 0.0008 (0.0250) model time 0.5938 (0.9721) loss 7.9380 (7.4875) grad_norm 1.9690 (2.7925) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:00:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][230/625] eta 0:05:53 lr 0.000161 wd 0.0500 time 0.5973 (0.8944) data time 0.0009 (0.0188) model time 0.5963 (0.8756) loss 7.1446 (7.4388) grad_norm 1.9447 (2.8070) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:00:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][240/625] eta 0:05:20 lr 0.000161 wd 0.0500 time 0.5992 (0.8335) data time 0.0010 (0.0152) model time 0.5982 (0.8183) loss 6.4854 (7.3369) grad_norm 4.3652 (2.8956) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:00:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][250/625] eta 0:05:00 lr 0.000161 wd 0.0500 time 0.6052 (0.8003) data time 0.0008 (0.0128) model time 0.6044 (0.7874) loss 6.1034 (7.2745) grad_norm 3.2411 (2.9021) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:00:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][260/625] eta 0:04:41 lr 0.000161 wd 0.0500 time 0.6054 (0.7722) data time 0.0009 (0.0111) model time 0.6045 (0.7610) loss 7.1214 (7.2424) grad_norm 1.7788 (2.9364) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:00:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][270/625] eta 0:04:26 lr 0.000161 wd 0.0500 time 0.6040 (0.7513) data time 0.0010 (0.0098) model time 0.6030 (0.7414) loss 7.2257 (7.2348) grad_norm 2.1381 (2.8472) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:00:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][280/625] eta 0:04:13 lr 0.000161 wd 0.0500 time 0.6067 (0.7352) data time 0.0007 (0.0088) model time 0.6060 (0.7264) loss 7.0452 (7.2286) grad_norm 2.2966 (2.7769) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:00:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][290/625] eta 0:04:01 lr 0.000161 wd 0.0500 time 0.5995 (0.7220) data time 0.0010 (0.0080) model time 0.5985 (0.7140) loss 7.4082 (7.2440) grad_norm 1.9966 (2.7370) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:01:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][300/625] eta 0:03:51 lr 0.000161 wd 0.0500 time 0.5967 (0.7109) data time 0.0007 (0.0074) model time 0.5959 (0.7035) loss 7.6199 (7.2538) grad_norm 1.7857 (2.6829) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:01:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][310/625] eta 0:03:40 lr 0.000161 wd 0.0500 time 0.6009 (0.7015) data time 0.0009 (0.0069) model time 0.6000 (0.6947) loss 7.6908 (7.2567) grad_norm 2.5917 (2.6490) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:01:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][320/625] eta 0:03:31 lr 0.000161 wd 0.0500 time 0.5987 (0.6937) data time 0.0008 (0.0064) model time 0.5979 (0.6872) loss 6.6129 (7.2383) grad_norm 2.3446 (2.7230) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:01:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][330/625] eta 0:03:22 lr 0.000161 wd 0.0500 time 0.6055 (0.6873) data time 0.0008 (0.0060) model time 0.6047 (0.6813) loss 7.8697 (7.2367) grad_norm 1.7660 (2.6913) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:01:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][340/625] eta 0:03:14 lr 0.000161 wd 0.0500 time 0.6090 (0.6822) data time 0.0008 (0.0057) model time 0.6083 (0.6765) loss 6.9103 (7.2040) grad_norm 2.1288 (2.7512) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:01:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][350/625] eta 0:03:06 lr 0.000161 wd 0.0500 time 0.6073 (0.6776) data time 0.0007 (0.0054) model time 0.6066 (0.6722) loss 7.9307 (7.2127) grad_norm 1.9684 (2.7468) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:01:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][360/625] eta 0:02:58 lr 0.000161 wd 0.0500 time 0.5974 (0.6735) data time 0.0010 (0.0051) model time 0.5964 (0.6684) loss 6.3127 (7.2210) grad_norm 2.7260 (2.7471) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:01:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][370/625] eta 0:02:50 lr 0.000160 wd 0.0500 time 0.6011 (0.6695) data time 0.0008 (0.0049) model time 0.6003 (0.6646) loss 6.8031 (7.1971) grad_norm 1.7515 (2.7638) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:01:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][380/625] eta 0:02:43 lr 0.000160 wd 0.0500 time 0.5997 (0.6659) data time 0.0007 (0.0047) model time 0.5990 (0.6612) loss 7.1445 (7.1921) grad_norm 2.0776 (2.9449) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:01:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][390/625] eta 0:02:35 lr 0.000160 wd 0.0500 time 0.6000 (0.6627) data time 0.0009 (0.0045) model time 0.5991 (0.6582) loss 6.5527 (7.1712) grad_norm 2.5346 (2.9093) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:02:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][400/625] eta 0:02:28 lr 0.000160 wd 0.0500 time 0.6038 (0.6599) data time 0.0007 (0.0043) model time 0.6031 (0.6555) loss 6.8597 (7.1524) grad_norm 2.4957 (2.8913) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:02:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][410/625] eta 0:02:21 lr 0.000160 wd 0.0500 time 0.6080 (0.6576) data time 0.0007 (0.0042) model time 0.6073 (0.6534) loss 7.8163 (7.1500) grad_norm 2.2194 (2.8686) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:02:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][420/625] eta 0:02:14 lr 0.000160 wd 0.0500 time 0.6068 (0.6554) data time 0.0007 (0.0041) model time 0.6061 (0.6514) loss 5.9566 (7.1401) grad_norm 3.2479 (2.8500) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:02:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][430/625] eta 0:02:07 lr 0.000160 wd 0.0500 time 0.6030 (0.6536) data time 0.0008 (0.0039) model time 0.6022 (0.6497) loss 6.2364 (7.1375) grad_norm 2.5840 (2.8341) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:02:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][440/625] eta 0:02:00 lr 0.000160 wd 0.0500 time 0.6007 (0.6515) data time 0.0009 (0.0038) model time 0.5998 (0.6477) loss 7.1723 (7.1329) grad_norm 2.0480 (2.8096) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:02:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][450/625] eta 0:01:53 lr 0.000160 wd 0.0500 time 0.6022 (0.6496) data time 0.0009 (0.0037) model time 0.6013 (0.6459) loss 8.8085 (7.1258) grad_norm 1.6569 (2.7833) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:02:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][460/625] eta 0:01:46 lr 0.000160 wd 0.0500 time 0.5989 (0.6479) data time 0.0008 (0.0036) model time 0.5982 (0.6443) loss 6.6271 (7.1176) grad_norm 3.0889 (2.7688) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:02:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][470/625] eta 0:01:40 lr 0.000160 wd 0.0500 time 0.6088 (0.6472) data time 0.0009 (0.0035) model time 0.6078 (0.6437) loss 7.5260 (7.1229) grad_norm 2.3141 (2.7680) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:02:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][480/625] eta 0:01:33 lr 0.000160 wd 0.0500 time 0.6096 (0.6459) data time 0.0009 (0.0034) model time 0.6086 (0.6425) loss 6.6028 (7.1261) grad_norm 3.6356 (2.7641) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:02:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][490/625] eta 0:01:27 lr 0.000160 wd 0.0500 time 0.6072 (0.6447) data time 0.0010 (0.0033) model time 0.6062 (0.6414) loss 7.9295 (7.1121) grad_norm 1.8304 (2.7494) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:03:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][500/625] eta 0:01:20 lr 0.000160 wd 0.0500 time 0.6066 (0.6435) data time 0.0010 (0.0033) model time 0.6056 (0.6403) loss 8.4716 (7.1210) grad_norm 2.1033 (2.7387) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 22:03:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-27 22:03:06 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 22:03:09 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 22:09:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 22:09:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 22:09:41 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 22:10:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 22:10:11 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 22:10:12 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 22:10:12 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 22:10:12 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 235) +[2024-07-27 22:10:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 22:10:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][510/625] eta 0:03:14 lr 0.000159 wd 0.0500 time 0.5955 (1.6909) data time 0.0011 (0.0603) model time 0.5944 (1.6306) loss 7.8475 (7.3170) grad_norm 2.1056 (2.2882) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:10:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][520/625] eta 0:01:59 lr 0.000159 wd 0.0500 time 0.5924 (1.1417) data time 0.0008 (0.0307) model time 0.5916 (1.1110) loss 7.7859 (7.2937) grad_norm 2.5828 (2.3755) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:10:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][530/625] eta 0:01:30 lr 0.000159 wd 0.0500 time 0.5853 (0.9573) data time 0.0010 (0.0208) model time 0.5842 (0.9365) loss 7.8933 (7.3981) grad_norm 2.3056 (2.4394) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:10:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][540/625] eta 0:01:13 lr 0.000159 wd 0.0500 time 0.5875 (0.8649) data time 0.0008 (0.0159) model time 0.5867 (0.8490) loss 5.5914 (7.1919) grad_norm 2.5139 (2.5757) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:10:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][550/625] eta 0:01:00 lr 0.000159 wd 0.0500 time 0.5896 (0.8094) data time 0.0011 (0.0129) model time 0.5884 (0.7965) loss 6.6649 (7.1950) grad_norm 7.0185 (2.6497) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:11:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][560/625] eta 0:00:50 lr 0.000159 wd 0.0500 time 0.5888 (0.7798) data time 0.0009 (0.0110) model time 0.5879 (0.7688) loss 6.4116 (7.1763) grad_norm 4.1184 (2.6415) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:11:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][570/625] eta 0:00:41 lr 0.000159 wd 0.0500 time 0.5930 (0.7531) data time 0.0008 (0.0096) model time 0.5922 (0.7436) loss 6.3835 (7.1618) grad_norm 1.9690 (2.6241) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:11:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-27 22:11:11 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 22:11:16 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 22:14:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 22:14:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 22:14:45 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 22:16:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 22:16:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 22:16:39 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 22:16:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 22:16:51 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 22:16:51 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 22:16:51 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 22:16:52 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 235) +[2024-07-27 22:16:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 22:17:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][580/625] eta 0:01:26 lr 0.000159 wd 0.0500 time 0.5791 (1.9214) data time 0.0007 (0.1014) model time 0.5784 (1.8200) loss 6.7649 (7.3823) grad_norm 2.5227 (3.3423) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:17:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][590/625] eta 0:00:41 lr 0.000159 wd 0.0500 time 0.5767 (1.1735) data time 0.0007 (0.0456) model time 0.5759 (1.1279) loss 6.9077 (7.3335) grad_norm 2.3470 (2.9922) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:17:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][600/625] eta 0:00:23 lr 0.000159 wd 0.0500 time 0.5720 (0.9590) data time 0.0009 (0.0297) model time 0.5710 (0.9292) loss 8.4213 (7.4813) grad_norm 2.5154 (2.7778) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:17:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][610/625] eta 0:00:12 lr 0.000159 wd 0.0500 time 0.5713 (0.8572) data time 0.0006 (0.0222) model time 0.5707 (0.8350) loss 7.4769 (7.4333) grad_norm 1.8025 (2.7147) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:17:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [235/300][620/625] eta 0:00:03 lr 0.000159 wd 0.0500 time 0.5732 (0.7978) data time 0.0004 (0.0177) model time 0.5728 (0.7801) loss 6.7038 (7.3445) grad_norm 2.4744 (2.6209) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:17:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 235 training takes 0:00:40 +[2024-07-27 22:17:36 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 22:17:39 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 22:17:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.462 (0.462) Loss 0.4927 (0.4927) Acc@1 90.527 (90.527) Acc@5 99.023 (99.023) Mem 22341MB +[2024-07-27 22:17:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.155) Loss 0.7500 (0.6053) Acc@1 82.324 (87.877) Acc@5 97.021 (98.082) Mem 22341MB +[2024-07-27 22:17:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.141) Loss 0.8281 (0.6951) Acc@1 80.518 (85.152) Acc@5 96.387 (97.268) Mem 22341MB +[2024-07-27 22:17:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.795 Acc@5 97.275 +[2024-07-27 22:17:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.8% +[2024-07-27 22:17:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 84.79% +[2024-07-27 22:17:45 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-27 22:17:48 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-27 22:17:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.459 (0.459) Loss 0.5049 (0.5049) Acc@1 90.430 (90.430) Acc@5 99.023 (99.023) Mem 22341MB +[2024-07-27 22:17:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.124 (0.155) Loss 0.7451 (0.6144) Acc@1 83.350 (87.886) Acc@5 96.924 (98.096) Mem 22341MB +[2024-07-27 22:17:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.124 (0.140) Loss 0.8472 (0.7012) Acc@1 79.785 (85.075) Acc@5 96.240 (97.303) Mem 22341MB +[2024-07-27 22:17:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.663 Acc@5 97.291 +[2024-07-27 22:17:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.7% +[2024-07-27 22:17:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.66% +[2024-07-27 22:17:51 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-27 22:17:54 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-27 22:17:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][0/625] eta 0:11:21 lr 0.000159 wd 0.0500 time 1.0904 (1.0904) data time 0.3377 (0.3377) model time 0.0000 (0.0000) loss 6.9857 (6.9857) grad_norm 2.0179 (2.0179) loss_scale 256.0000 (256.0000) mem 22337MB +[2024-07-27 22:18:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][10/625] eta 0:06:36 lr 0.000159 wd 0.0500 time 0.5783 (0.6447) data time 0.0009 (0.0315) model time 0.0000 (0.0000) loss 7.5899 (6.8224) grad_norm 2.0121 (2.2125) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:18:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][20/625] eta 0:06:10 lr 0.000159 wd 0.0500 time 0.5838 (0.6130) data time 0.0010 (0.0169) model time 0.0000 (0.0000) loss 7.6598 (6.9338) grad_norm 2.6870 (2.2364) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:18:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-27 22:18:12 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 22:18:15 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 22:23:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 22:23:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 22:23:46 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 22:24:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 22:24:00 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 22:24:01 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 22:24:01 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 22:24:01 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 236) +[2024-07-27 22:24:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 22:24:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][30/625] eta 1:46:11 lr 0.000158 wd 0.0500 time 10.7077 (10.7077) data time 0.6108 (0.6108) model time 0.0000 (0.0000) loss 7.6020 (7.6020) grad_norm 4.3295 (4.3295) loss_scale 256.0000 (256.0000) mem 26016MB +[2024-07-27 22:24:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][40/625] eta 0:15:24 lr 0.000158 wd 0.0500 time 0.5682 (1.5801) data time 0.0008 (0.0564) model time 0.0000 (0.0000) loss 6.6730 (7.2960) grad_norm 2.1818 (2.2423) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:24:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][50/625] eta 0:10:31 lr 0.000158 wd 0.0500 time 0.5665 (1.0981) data time 0.0008 (0.0300) model time 0.0000 (0.0000) loss 7.3158 (7.2547) grad_norm 2.3602 (2.2025) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:24:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][60/625] eta 0:08:43 lr 0.000158 wd 0.0500 time 0.5712 (0.9274) data time 0.0007 (0.0206) model time 0.5705 (0.5679) loss 5.9787 (7.3063) grad_norm 1.7246 (2.2457) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:24:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][70/625] eta 0:07:46 lr 0.000158 wd 0.0500 time 0.5670 (0.8404) data time 0.0008 (0.0158) model time 0.5662 (0.5688) loss 6.8370 (7.2529) grad_norm 1.7488 (2.3546) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:24:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][80/625] eta 0:07:10 lr 0.000158 wd 0.0500 time 0.7409 (0.7904) data time 0.0006 (0.0129) model time 0.7403 (0.5740) loss 7.9687 (7.2806) grad_norm 2.0371 (2.5113) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:24:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][90/625] eta 0:06:44 lr 0.000158 wd 0.0500 time 0.5700 (0.7568) data time 0.0008 (0.0109) model time 0.5692 (0.5768) loss 7.2231 (7.2395) grad_norm 3.1544 (2.5331) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:24:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][100/625] eta 0:06:23 lr 0.000158 wd 0.0500 time 0.5717 (0.7308) data time 0.0008 (0.0095) model time 0.5709 (0.5756) loss 7.3707 (7.2047) grad_norm 1.4874 (2.4540) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:25:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][110/625] eta 0:06:06 lr 0.000158 wd 0.0500 time 0.5723 (0.7113) data time 0.0008 (0.0084) model time 0.5715 (0.5750) loss 6.8584 (7.2240) grad_norm 2.1114 (2.4584) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:25:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][120/625] eta 0:05:51 lr 0.000158 wd 0.0500 time 0.5736 (0.6960) data time 0.0006 (0.0076) model time 0.5731 (0.5745) loss 7.4031 (7.2077) grad_norm 2.6612 (2.4479) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:25:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][130/625] eta 0:05:38 lr 0.000158 wd 0.0500 time 0.5768 (0.6838) data time 0.0006 (0.0069) model time 0.5762 (0.5742) loss 7.4187 (7.2126) grad_norm 2.6927 (2.4616) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:25:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][140/625] eta 0:05:26 lr 0.000158 wd 0.0500 time 0.5701 (0.6736) data time 0.0008 (0.0064) model time 0.5692 (0.5737) loss 6.7274 (7.2276) grad_norm 2.4921 (2.4648) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:25:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][150/625] eta 0:05:15 lr 0.000158 wd 0.0500 time 0.5673 (0.6652) data time 0.0007 (0.0059) model time 0.5666 (0.5735) loss 6.8490 (7.2552) grad_norm 2.8268 (2.4546) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:25:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][160/625] eta 0:05:05 lr 0.000158 wd 0.0500 time 0.5699 (0.6580) data time 0.0007 (0.0055) model time 0.5691 (0.5731) loss 8.1132 (7.2405) grad_norm 2.1951 (2.4774) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:25:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][170/625] eta 0:04:56 lr 0.000157 wd 0.0500 time 0.5746 (0.6520) data time 0.0006 (0.0052) model time 0.5739 (0.5731) loss 6.6432 (7.2086) grad_norm 2.6961 (2.7440) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:25:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][180/625] eta 0:04:47 lr 0.000157 wd 0.0500 time 0.5725 (0.6469) data time 0.0008 (0.0049) model time 0.5717 (0.5732) loss 6.4973 (7.2023) grad_norm 2.7524 (2.7377) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:25:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][190/625] eta 0:04:39 lr 0.000157 wd 0.0500 time 0.5736 (0.6424) data time 0.0009 (0.0046) model time 0.5727 (0.5732) loss 7.6690 (7.2063) grad_norm 2.4635 (2.7251) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:25:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][200/625] eta 0:04:31 lr 0.000157 wd 0.0500 time 0.5709 (0.6383) data time 0.0007 (0.0044) model time 0.5701 (0.5731) loss 6.6690 (7.2068) grad_norm 3.6831 (2.7908) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:26:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][210/625] eta 0:04:23 lr 0.000157 wd 0.0500 time 0.5705 (0.6346) data time 0.0008 (0.0042) model time 0.5697 (0.5729) loss 8.6826 (7.2046) grad_norm 4.6176 (2.8359) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:26:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][220/625] eta 0:04:15 lr 0.000157 wd 0.0500 time 0.5718 (0.6313) data time 0.0008 (0.0040) model time 0.5710 (0.5728) loss 6.0999 (7.1920) grad_norm 2.9890 (2.9107) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:26:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][230/625] eta 0:04:08 lr 0.000157 wd 0.0500 time 0.5703 (0.6283) data time 0.0009 (0.0039) model time 0.5694 (0.5727) loss 6.6609 (7.1674) grad_norm 2.2136 (2.8940) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:26:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][240/625] eta 0:04:00 lr 0.000157 wd 0.0500 time 0.5721 (0.6257) data time 0.0008 (0.0037) model time 0.5713 (0.5726) loss 7.8417 (7.1494) grad_norm 2.2843 (2.8968) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:26:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][250/625] eta 0:03:53 lr 0.000157 wd 0.0500 time 0.5741 (0.6234) data time 0.0006 (0.0036) model time 0.5735 (0.5727) loss 7.4450 (7.1420) grad_norm 4.5863 (2.9249) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:26:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][260/625] eta 0:03:46 lr 0.000157 wd 0.0500 time 0.5736 (0.6213) data time 0.0006 (0.0035) model time 0.5730 (0.5728) loss 6.4365 (7.1513) grad_norm 2.9041 (2.9105) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:26:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][270/625] eta 0:03:39 lr 0.000157 wd 0.0500 time 0.5749 (0.6194) data time 0.0006 (0.0034) model time 0.5743 (0.5729) loss 6.3766 (7.1464) grad_norm 2.0413 (2.9060) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:26:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][280/625] eta 0:03:33 lr 0.000157 wd 0.0500 time 0.5711 (0.6176) data time 0.0006 (0.0033) model time 0.5705 (0.5728) loss 6.9047 (7.1380) grad_norm 2.6540 (2.8816) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:26:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][290/625] eta 0:03:26 lr 0.000157 wd 0.0500 time 0.5717 (0.6159) data time 0.0006 (0.0032) model time 0.5711 (0.5728) loss 7.1843 (7.1193) grad_norm 3.3747 (2.9342) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:26:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][300/625] eta 0:03:19 lr 0.000157 wd 0.0500 time 0.5707 (0.6143) data time 0.0006 (0.0031) model time 0.5701 (0.5728) loss 8.4800 (7.1159) grad_norm 2.0023 (2.9183) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:26:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][310/625] eta 0:03:13 lr 0.000157 wd 0.0500 time 0.5737 (0.6139) data time 0.0008 (0.0030) model time 0.5729 (0.5739) loss 7.1191 (7.1245) grad_norm 3.4912 (2.9275) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:27:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][320/625] eta 0:03:06 lr 0.000156 wd 0.0500 time 0.5747 (0.6125) data time 0.0006 (0.0029) model time 0.5741 (0.5739) loss 5.9599 (7.1236) grad_norm 2.4700 (2.9003) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:27:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][330/625] eta 0:03:00 lr 0.000156 wd 0.0500 time 0.5737 (0.6112) data time 0.0008 (0.0029) model time 0.5729 (0.5739) loss 6.3227 (7.1127) grad_norm 2.0289 (2.8941) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:27:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][340/625] eta 0:02:53 lr 0.000156 wd 0.0500 time 0.5798 (0.6101) data time 0.0008 (0.0028) model time 0.5790 (0.5739) loss 7.7600 (7.1083) grad_norm 2.6793 (2.8907) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:27:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][350/625] eta 0:02:47 lr 0.000156 wd 0.0500 time 0.5698 (0.6090) data time 0.0006 (0.0028) model time 0.5692 (0.5739) loss 7.1399 (7.1208) grad_norm 1.7272 (2.8732) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:27:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][360/625] eta 0:02:41 lr 0.000156 wd 0.0500 time 0.5732 (0.6079) data time 0.0006 (0.0027) model time 0.5726 (0.5738) loss 6.7290 (7.1246) grad_norm 2.5435 (2.8548) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:27:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][370/625] eta 0:02:34 lr 0.000156 wd 0.0500 time 0.5776 (0.6069) data time 0.0009 (0.0026) model time 0.5767 (0.5738) loss 8.2279 (7.1308) grad_norm 2.1408 (2.8431) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:27:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][380/625] eta 0:02:28 lr 0.000156 wd 0.0500 time 0.5735 (0.6060) data time 0.0006 (0.0026) model time 0.5729 (0.5738) loss 7.4558 (7.1321) grad_norm 1.9539 (2.8256) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:27:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][390/625] eta 0:02:22 lr 0.000156 wd 0.0500 time 0.5722 (0.6051) data time 0.0006 (0.0025) model time 0.5715 (0.5738) loss 6.3799 (7.1281) grad_norm 2.2371 (2.8103) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:27:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-27 22:27:45 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 22:27:48 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 22:29:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 22:29:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 22:29:59 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 22:30:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 22:30:14 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 22:30:15 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 22:30:15 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 22:30:15 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 236) +[2024-07-27 22:30:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 22:30:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][400/625] eta 0:06:26 lr 0.000156 wd 0.0500 time 0.5711 (1.7195) data time 0.0006 (0.1292) model time 0.5704 (1.5903) loss 7.2087 (7.3170) grad_norm 2.3511 (2.2960) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:30:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][410/625] eta 0:03:52 lr 0.000156 wd 0.0500 time 0.5699 (1.0814) data time 0.0007 (0.0580) model time 0.5692 (1.0234) loss 7.1674 (7.3451) grad_norm 1.7015 (2.2123) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:30:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][420/625] eta 0:03:04 lr 0.000156 wd 0.0500 time 0.5732 (0.8996) data time 0.0009 (0.0376) model time 0.5723 (0.8620) loss 8.1525 (7.3721) grad_norm 2.6842 (2.1892) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:30:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][430/625] eta 0:02:38 lr 0.000156 wd 0.0500 time 0.5758 (0.8137) data time 0.0009 (0.0279) model time 0.5749 (0.7858) loss 7.1555 (7.2879) grad_norm 2.1629 (2.5975) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:30:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][440/625] eta 0:02:21 lr 0.000156 wd 0.0500 time 0.5770 (0.7636) data time 0.0007 (0.0223) model time 0.5763 (0.7414) loss 7.8680 (7.2802) grad_norm 2.0464 (2.5978) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:31:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][450/625] eta 0:02:08 lr 0.000156 wd 0.0500 time 0.5762 (0.7365) data time 0.0006 (0.0186) model time 0.5756 (0.7179) loss 5.9795 (7.2165) grad_norm 2.3956 (2.6881) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:31:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][460/625] eta 0:01:57 lr 0.000155 wd 0.0500 time 0.5853 (0.7135) data time 0.0006 (0.0160) model time 0.5847 (0.6975) loss 6.4813 (7.1692) grad_norm 1.6478 (2.6686) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:31:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][470/625] eta 0:01:47 lr 0.000155 wd 0.0500 time 0.5781 (0.6956) data time 0.0007 (0.0141) model time 0.5774 (0.6815) loss 6.7016 (7.1583) grad_norm 4.2200 (2.7325) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:31:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][480/625] eta 0:01:38 lr 0.000155 wd 0.0500 time 0.5751 (0.6821) data time 0.0009 (0.0126) model time 0.5742 (0.6695) loss 8.5529 (7.1611) grad_norm 2.4632 (2.8977) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:31:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][490/625] eta 0:01:30 lr 0.000155 wd 0.0500 time 0.5772 (0.6713) data time 0.0006 (0.0114) model time 0.5766 (0.6599) loss 7.7085 (7.1543) grad_norm 2.0563 (2.8454) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:31:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][500/625] eta 0:01:22 lr 0.000155 wd 0.0500 time 0.5763 (0.6625) data time 0.0007 (0.0104) model time 0.5756 (0.6520) loss 6.3088 (7.1622) grad_norm 2.1883 (2.8387) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:31:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][510/625] eta 0:01:15 lr 0.000155 wd 0.0500 time 0.5755 (0.6552) data time 0.0008 (0.0096) model time 0.5746 (0.6456) loss 7.5543 (7.1613) grad_norm 2.8216 (2.8594) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:31:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][520/625] eta 0:01:08 lr 0.000155 wd 0.0500 time 0.5784 (0.6491) data time 0.0006 (0.0089) model time 0.5778 (0.6402) loss 7.1380 (7.1517) grad_norm 2.0479 (2.8082) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:31:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][530/625] eta 0:01:01 lr 0.000155 wd 0.0500 time 0.5761 (0.6439) data time 0.0008 (0.0084) model time 0.5753 (0.6355) loss 6.9813 (7.1500) grad_norm 1.9247 (2.7810) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:31:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][540/625] eta 0:00:54 lr 0.000155 wd 0.0500 time 0.5740 (0.6393) data time 0.0007 (0.0079) model time 0.5733 (0.6314) loss 6.6755 (7.1368) grad_norm 1.9023 (2.7508) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:32:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][550/625] eta 0:00:47 lr 0.000155 wd 0.0500 time 0.5777 (0.6353) data time 0.0007 (0.0074) model time 0.5770 (0.6279) loss 6.0220 (7.1255) grad_norm 1.9473 (2.7564) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:32:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][560/625] eta 0:00:41 lr 0.000155 wd 0.0500 time 0.5820 (0.6317) data time 0.0009 (0.0070) model time 0.5811 (0.6247) loss 8.7011 (7.1447) grad_norm 2.0814 (2.7404) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:32:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][570/625] eta 0:00:34 lr 0.000155 wd 0.0500 time 0.5772 (0.6285) data time 0.0006 (0.0067) model time 0.5765 (0.6219) loss 6.4158 (7.1288) grad_norm 5.1510 (2.7369) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:32:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][580/625] eta 0:00:28 lr 0.000155 wd 0.0500 time 0.5782 (0.6258) data time 0.0007 (0.0064) model time 0.5775 (0.6194) loss 6.8792 (7.1310) grad_norm 2.7346 (2.7202) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:32:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][590/625] eta 0:00:21 lr 0.000155 wd 0.0500 time 0.5779 (0.6233) data time 0.0010 (0.0061) model time 0.5769 (0.6172) loss 6.4226 (7.1214) grad_norm 1.9289 (2.6975) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:32:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][600/625] eta 0:00:15 lr 0.000154 wd 0.0500 time 0.5771 (0.6211) data time 0.0008 (0.0059) model time 0.5763 (0.6152) loss 7.3950 (7.1067) grad_norm 5.3153 (2.6971) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:32:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][610/625] eta 0:00:09 lr 0.000154 wd 0.0500 time 0.5740 (0.6191) data time 0.0007 (0.0056) model time 0.5733 (0.6135) loss 7.1209 (7.0889) grad_norm 3.9249 (2.6863) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:32:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [236/300][620/625] eta 0:00:03 lr 0.000154 wd 0.0500 time 0.5783 (0.6172) data time 0.0007 (0.0054) model time 0.5776 (0.6118) loss 8.5218 (7.0965) grad_norm 2.9540 (2.6954) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-27 22:32:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 236 training takes 0:02:23 +[2024-07-27 22:32:42 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 22:32:48 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 22:32:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.465 (0.465) Loss 0.4968 (0.4968) Acc@1 90.332 (90.332) Acc@5 98.926 (98.926) Mem 22341MB +[2024-07-27 22:32:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.156) Loss 0.7402 (0.6101) Acc@1 83.008 (87.842) Acc@5 96.973 (98.047) Mem 22341MB +[2024-07-27 22:32:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8452 (0.6970) Acc@1 80.127 (85.154) Acc@5 96.338 (97.280) Mem 22341MB +[2024-07-27 22:32:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.761 Acc@5 97.277 +[2024-07-27 22:32:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.8% +[2024-07-27 22:32:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.867 (0.867) Loss 0.5049 (0.5049) Acc@1 90.430 (90.430) Acc@5 98.975 (98.975) Mem 22341MB +[2024-07-27 22:32:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.193) Loss 0.7451 (0.6143) Acc@1 83.203 (87.900) Acc@5 96.924 (98.087) Mem 22341MB +[2024-07-27 22:32:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.161) Loss 0.8472 (0.7008) Acc@1 79.834 (85.077) Acc@5 96.240 (97.289) Mem 22341MB +[2024-07-27 22:32:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.669 Acc@5 97.283 +[2024-07-27 22:32:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.7% +[2024-07-27 22:32:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.67% +[2024-07-27 22:32:57 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-27 22:33:02 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-27 22:33:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][0/625] eta 0:15:06 lr 0.000154 wd 0.0500 time 1.4502 (1.4502) data time 0.3737 (0.3737) model time 0.0000 (0.0000) loss 6.7395 (6.7395) grad_norm 9.2312 (9.2312) loss_scale 256.0000 (256.0000) mem 22337MB +[2024-07-27 22:33:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][10/625] eta 0:06:45 lr 0.000154 wd 0.0500 time 0.5828 (0.6587) data time 0.0009 (0.0348) model time 0.0000 (0.0000) loss 8.7061 (7.2798) grad_norm 1.7485 (3.3268) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:33:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][20/625] eta 0:06:15 lr 0.000154 wd 0.0500 time 0.5835 (0.6201) data time 0.0010 (0.0187) model time 0.0000 (0.0000) loss 7.3259 (7.0201) grad_norm 3.6601 (2.8903) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:33:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][30/625] eta 0:05:59 lr 0.000154 wd 0.0500 time 0.5736 (0.6050) data time 0.0007 (0.0129) model time 0.0000 (0.0000) loss 6.2994 (6.8934) grad_norm 2.2790 (2.8255) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:33:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][40/625] eta 0:05:49 lr 0.000154 wd 0.0500 time 0.5758 (0.5978) data time 0.0007 (0.0100) model time 0.0000 (0.0000) loss 7.4274 (6.9255) grad_norm 2.3154 (2.6974) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:33:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][50/625] eta 0:05:43 lr 0.000154 wd 0.0500 time 0.5748 (0.5974) data time 0.0008 (0.0082) model time 0.0000 (0.0000) loss 7.5366 (6.9936) grad_norm 2.0443 (2.7592) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:33:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][60/625] eta 0:05:35 lr 0.000154 wd 0.0500 time 0.5828 (0.5946) data time 0.0009 (0.0070) model time 0.5820 (0.5789) loss 6.5653 (7.0252) grad_norm 2.1741 (2.6823) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:33:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][70/625] eta 0:05:28 lr 0.000154 wd 0.0500 time 0.5800 (0.5925) data time 0.0007 (0.0062) model time 0.5793 (0.5791) loss 6.6685 (6.9812) grad_norm 2.3981 (2.5888) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:33:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][80/625] eta 0:05:22 lr 0.000154 wd 0.0500 time 0.5846 (0.5910) data time 0.0006 (0.0055) model time 0.5840 (0.5792) loss 7.3815 (7.0100) grad_norm 3.0606 (2.5540) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:33:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][90/625] eta 0:05:15 lr 0.000154 wd 0.0500 time 0.5817 (0.5898) data time 0.0008 (0.0050) model time 0.5809 (0.5792) loss 8.2365 (7.0678) grad_norm 2.3456 (2.5422) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:34:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][100/625] eta 0:05:09 lr 0.000154 wd 0.0500 time 0.5768 (0.5887) data time 0.0008 (0.0046) model time 0.5760 (0.5788) loss 7.3388 (7.0698) grad_norm 3.5320 (2.5489) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:34:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][110/625] eta 0:05:02 lr 0.000154 wd 0.0500 time 0.5795 (0.5877) data time 0.0009 (0.0042) model time 0.5786 (0.5786) loss 7.4187 (7.0992) grad_norm 2.6552 (2.5258) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:34:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][120/625] eta 0:04:56 lr 0.000153 wd 0.0500 time 0.5775 (0.5869) data time 0.0006 (0.0040) model time 0.5769 (0.5784) loss 7.1880 (7.0907) grad_norm 2.6406 (2.5084) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:34:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][130/625] eta 0:04:50 lr 0.000153 wd 0.0500 time 0.5798 (0.5862) data time 0.0006 (0.0037) model time 0.5792 (0.5782) loss 7.5125 (7.0786) grad_norm 2.3458 (2.5181) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:34:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][140/625] eta 0:04:44 lr 0.000153 wd 0.0500 time 0.5829 (0.5859) data time 0.0007 (0.0035) model time 0.5822 (0.5785) loss 7.7316 (7.0712) grad_norm 1.9973 (2.6327) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:34:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][150/625] eta 0:04:38 lr 0.000153 wd 0.0500 time 0.5835 (0.5856) data time 0.0008 (0.0033) model time 0.5826 (0.5787) loss 6.2225 (7.0559) grad_norm 3.6780 (2.6222) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:34:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][160/625] eta 0:04:32 lr 0.000153 wd 0.0500 time 0.5782 (0.5853) data time 0.0009 (0.0032) model time 0.5772 (0.5788) loss 7.3576 (7.0560) grad_norm 2.9863 (2.6326) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:34:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][170/625] eta 0:04:26 lr 0.000153 wd 0.0500 time 0.5756 (0.5849) data time 0.0009 (0.0031) model time 0.5747 (0.5787) loss 6.5727 (7.0626) grad_norm 2.9235 (2.6415) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:34:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][180/625] eta 0:04:20 lr 0.000153 wd 0.0500 time 0.5782 (0.5844) data time 0.0009 (0.0029) model time 0.5772 (0.5784) loss 7.4355 (7.0687) grad_norm 2.2774 (2.6548) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:34:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][190/625] eta 0:04:14 lr 0.000153 wd 0.0500 time 0.5820 (0.5841) data time 0.0006 (0.0028) model time 0.5814 (0.5783) loss 6.0637 (7.0557) grad_norm 4.7971 (2.6607) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:34:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][200/625] eta 0:04:08 lr 0.000153 wd 0.0500 time 0.5760 (0.5837) data time 0.0007 (0.0027) model time 0.5754 (0.5782) loss 8.2780 (7.0819) grad_norm 3.0657 (2.7536) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:35:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][210/625] eta 0:04:02 lr 0.000153 wd 0.0500 time 0.5806 (0.5842) data time 0.0006 (0.0026) model time 0.5800 (0.5792) loss 6.4658 (7.0999) grad_norm 1.5157 (2.7186) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:35:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][220/625] eta 0:03:56 lr 0.000153 wd 0.0500 time 0.5779 (0.5841) data time 0.0006 (0.0026) model time 0.5772 (0.5792) loss 5.5539 (7.0756) grad_norm 1.7215 (2.7320) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:35:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][230/625] eta 0:03:50 lr 0.000153 wd 0.0500 time 0.5783 (0.5839) data time 0.0006 (0.0025) model time 0.5777 (0.5792) loss 5.5109 (7.0614) grad_norm 3.2295 (2.7255) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:35:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][240/625] eta 0:03:44 lr 0.000153 wd 0.0500 time 0.5780 (0.5838) data time 0.0006 (0.0024) model time 0.5773 (0.5792) loss 6.9479 (7.0497) grad_norm 2.9152 (2.7941) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:35:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][250/625] eta 0:03:38 lr 0.000153 wd 0.0500 time 0.5783 (0.5836) data time 0.0006 (0.0024) model time 0.5777 (0.5792) loss 7.8079 (7.0503) grad_norm 2.2429 (2.7918) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:35:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][260/625] eta 0:03:32 lr 0.000153 wd 0.0500 time 0.5767 (0.5834) data time 0.0008 (0.0023) model time 0.5759 (0.5791) loss 6.2404 (7.0549) grad_norm 2.0461 (2.7865) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:35:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][270/625] eta 0:03:27 lr 0.000152 wd 0.0500 time 0.5759 (0.5836) data time 0.0006 (0.0022) model time 0.5753 (0.5796) loss 6.4495 (7.0465) grad_norm 2.1460 (2.7725) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:35:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][280/625] eta 0:03:21 lr 0.000152 wd 0.0500 time 0.5805 (0.5834) data time 0.0006 (0.0022) model time 0.5798 (0.5795) loss 8.0658 (7.0743) grad_norm 3.6524 (2.7623) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:35:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][290/625] eta 0:03:15 lr 0.000152 wd 0.0500 time 0.5906 (0.5833) data time 0.0009 (0.0022) model time 0.5898 (0.5795) loss 7.2968 (7.0658) grad_norm 2.2012 (2.7572) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 22:35:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-27 22:35:55 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 22:35:57 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 22:46:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 22:46:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 22:47:10 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 22:47:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 22:47:22 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 22:47:23 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 22:47:23 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 22:47:23 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 237) +[2024-07-27 22:47:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 22:47:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][300/625] eta 0:15:27 lr 0.000152 wd 0.0500 time 0.5634 (2.8525) data time 0.0008 (0.1984) model time 0.5627 (2.6541) loss 7.4489 (7.1987) grad_norm 3.0958 (2.2744) loss_scale 256.0000 (256.0000) mem 22343MB +[2024-07-27 22:47:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][310/625] eta 0:06:24 lr 0.000152 wd 0.0500 time 0.5660 (1.2191) data time 0.0006 (0.0574) model time 0.5654 (1.1617) loss 7.6086 (7.3410) grad_norm 2.4342 (2.2783) loss_scale 256.0000 (256.0000) mem 22343MB +[2024-07-27 22:47:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][320/625] eta 0:04:48 lr 0.000152 wd 0.0500 time 0.5672 (0.9470) data time 0.0009 (0.0339) model time 0.5663 (0.9131) loss 7.7965 (7.3936) grad_norm 2.7922 (2.4114) loss_scale 256.0000 (256.0000) mem 22343MB +[2024-07-27 22:47:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][330/625] eta 0:04:06 lr 0.000152 wd 0.0500 time 0.5632 (0.8347) data time 0.0006 (0.0241) model time 0.5626 (0.8106) loss 7.1415 (7.3962) grad_norm 6.0561 (2.4600) loss_scale 256.0000 (256.0000) mem 22343MB +[2024-07-27 22:48:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][340/625] eta 0:03:40 lr 0.000152 wd 0.0500 time 0.5712 (0.7739) data time 0.0007 (0.0189) model time 0.5705 (0.7550) loss 6.7031 (7.2952) grad_norm 2.1510 (2.4431) loss_scale 256.0000 (256.0000) mem 22343MB +[2024-07-27 22:48:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][350/625] eta 0:03:22 lr 0.000152 wd 0.0500 time 0.5662 (0.7379) data time 0.0007 (0.0156) model time 0.5655 (0.7223) loss 7.5625 (7.2789) grad_norm 3.3823 (2.5025) loss_scale 256.0000 (256.0000) mem 22343MB +[2024-07-27 22:48:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][360/625] eta 0:03:09 lr 0.000152 wd 0.0500 time 0.5754 (0.7152) data time 0.0006 (0.0133) model time 0.5747 (0.7019) loss 7.0667 (7.1992) grad_norm 2.9094 (2.5501) loss_scale 256.0000 (256.0000) mem 22343MB +[2024-07-27 22:48:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][370/625] eta 0:02:57 lr 0.000152 wd 0.0500 time 0.5721 (0.6957) data time 0.0006 (0.0116) model time 0.5715 (0.6841) loss 7.8296 (7.2052) grad_norm 2.7131 (2.7524) loss_scale 256.0000 (256.0000) mem 22343MB +[2024-07-27 22:48:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][380/625] eta 0:02:46 lr 0.000152 wd 0.0500 time 0.5721 (0.6810) data time 0.0009 (0.0103) model time 0.5712 (0.6707) loss 7.6622 (7.1948) grad_norm 3.1507 (2.7277) loss_scale 256.0000 (256.0000) mem 22343MB +[2024-07-27 22:48:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][390/625] eta 0:02:37 lr 0.000152 wd 0.0500 time 0.5738 (0.6694) data time 0.0010 (0.0093) model time 0.5728 (0.6601) loss 6.2347 (7.1764) grad_norm 2.0848 (2.6808) loss_scale 256.0000 (256.0000) mem 22343MB +[2024-07-27 22:48:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][400/625] eta 0:02:28 lr 0.000152 wd 0.0500 time 0.5701 (0.6600) data time 0.0008 (0.0085) model time 0.5693 (0.6515) loss 7.0490 (7.1974) grad_norm 2.5280 (2.6618) loss_scale 256.0000 (256.0000) mem 22343MB +[2024-07-27 22:48:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][410/625] eta 0:02:20 lr 0.000151 wd 0.0500 time 0.5771 (0.6523) data time 0.0009 (0.0079) model time 0.5762 (0.6444) loss 7.9709 (7.1873) grad_norm 2.1388 (2.6259) loss_scale 256.0000 (256.0000) mem 22343MB +[2024-07-27 22:48:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][420/625] eta 0:02:12 lr 0.000151 wd 0.0500 time 0.5723 (0.6459) data time 0.0009 (0.0073) model time 0.5715 (0.6386) loss 6.2076 (7.1813) grad_norm 2.3317 (2.5856) loss_scale 256.0000 (256.0000) mem 22343MB +[2024-07-27 22:48:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][430/625] eta 0:02:04 lr 0.000151 wd 0.0500 time 0.5736 (0.6404) data time 0.0009 (0.0068) model time 0.5727 (0.6335) loss 7.6001 (7.1825) grad_norm 1.9284 (2.5831) loss_scale 256.0000 (256.0000) mem 22343MB +[2024-07-27 22:48:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][440/625] eta 0:01:57 lr 0.000151 wd 0.0500 time 0.5743 (0.6357) data time 0.0007 (0.0064) model time 0.5736 (0.6293) loss 6.4962 (7.1715) grad_norm 2.2001 (inf) loss_scale 128.0000 (248.8889) mem 22343MB +[2024-07-27 22:49:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][450/625] eta 0:01:50 lr 0.000151 wd 0.0500 time 0.5764 (0.6318) data time 0.0006 (0.0061) model time 0.5757 (0.6257) loss 6.8847 (7.1628) grad_norm 3.1507 (inf) loss_scale 128.0000 (241.0390) mem 22343MB +[2024-07-27 22:49:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][460/625] eta 0:01:43 lr 0.000151 wd 0.0500 time 0.5746 (0.6284) data time 0.0007 (0.0058) model time 0.5740 (0.6227) loss 6.5306 (7.1663) grad_norm 2.1113 (inf) loss_scale 128.0000 (234.1463) mem 22343MB +[2024-07-27 22:49:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][470/625] eta 0:01:36 lr 0.000151 wd 0.0500 time 0.5772 (0.6253) data time 0.0008 (0.0055) model time 0.5764 (0.6198) loss 6.5609 (7.1670) grad_norm 1.8516 (inf) loss_scale 128.0000 (228.0460) mem 22343MB +[2024-07-27 22:49:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][480/625] eta 0:01:30 lr 0.000151 wd 0.0500 time 0.5763 (0.6224) data time 0.0006 (0.0052) model time 0.5757 (0.6172) loss 6.3226 (7.1649) grad_norm 2.8656 (inf) loss_scale 128.0000 (222.6087) mem 22343MB +[2024-07-27 22:49:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][490/625] eta 0:01:23 lr 0.000151 wd 0.0500 time 0.5700 (0.6198) data time 0.0006 (0.0050) model time 0.5694 (0.6148) loss 6.5176 (7.1484) grad_norm 1.9106 (inf) loss_scale 128.0000 (217.7320) mem 22343MB +[2024-07-27 22:49:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][500/625] eta 0:01:17 lr 0.000151 wd 0.0500 time 0.5841 (0.6175) data time 0.0008 (0.0048) model time 0.5833 (0.6127) loss 7.2786 (7.1192) grad_norm 3.5433 (inf) loss_scale 128.0000 (213.3333) mem 22343MB +[2024-07-27 22:49:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][510/625] eta 0:01:10 lr 0.000151 wd 0.0500 time 0.5825 (0.6155) data time 0.0006 (0.0046) model time 0.5819 (0.6109) loss 7.4611 (7.1170) grad_norm 2.0352 (inf) loss_scale 128.0000 (209.3458) mem 22343MB +[2024-07-27 22:49:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][520/625] eta 0:01:04 lr 0.000151 wd 0.0500 time 0.5742 (0.6136) data time 0.0008 (0.0044) model time 0.5734 (0.6092) loss 8.1713 (7.1234) grad_norm 2.7140 (inf) loss_scale 128.0000 (205.7143) mem 22343MB +[2024-07-27 22:49:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][530/625] eta 0:00:58 lr 0.000151 wd 0.0500 time 0.5733 (0.6119) data time 0.0006 (0.0043) model time 0.5727 (0.6076) loss 6.1214 (7.1192) grad_norm 2.9912 (inf) loss_scale 128.0000 (202.3932) mem 22343MB +[2024-07-27 22:49:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][540/625] eta 0:00:51 lr 0.000151 wd 0.0500 time 0.5702 (0.6103) data time 0.0006 (0.0041) model time 0.5695 (0.6061) loss 5.4560 (7.1211) grad_norm 2.3931 (inf) loss_scale 128.0000 (199.3443) mem 22343MB +[2024-07-27 22:50:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][550/625] eta 0:00:45 lr 0.000151 wd 0.0500 time 0.5683 (0.6087) data time 0.0006 (0.0040) model time 0.5677 (0.6047) loss 6.8782 (7.1236) grad_norm 2.2343 (inf) loss_scale 128.0000 (196.5354) mem 22343MB +[2024-07-27 22:50:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][560/625] eta 0:00:39 lr 0.000150 wd 0.0500 time 0.5695 (0.6072) data time 0.0006 (0.0039) model time 0.5689 (0.6033) loss 7.2813 (7.1090) grad_norm 2.3967 (inf) loss_scale 128.0000 (193.9394) mem 22343MB +[2024-07-27 22:50:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][570/625] eta 0:00:33 lr 0.000150 wd 0.0500 time 0.7765 (0.6069) data time 0.0009 (0.0038) model time 0.7756 (0.6031) loss 8.2526 (7.0994) grad_norm 2.0226 (inf) loss_scale 128.0000 (191.5328) mem 22343MB +[2024-07-27 22:50:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][580/625] eta 0:00:27 lr 0.000150 wd 0.0500 time 0.5718 (0.6056) data time 0.0006 (0.0037) model time 0.5712 (0.6019) loss 5.9806 (7.0997) grad_norm 1.5473 (inf) loss_scale 128.0000 (189.2958) mem 22343MB +[2024-07-27 22:50:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][590/625] eta 0:00:21 lr 0.000150 wd 0.0500 time 0.5728 (0.6044) data time 0.0006 (0.0036) model time 0.5722 (0.6009) loss 6.0979 (7.0939) grad_norm 3.0718 (inf) loss_scale 128.0000 (187.2109) mem 22343MB +[2024-07-27 22:50:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][600/625] eta 0:00:15 lr 0.000150 wd 0.0500 time 0.5735 (0.6034) data time 0.0009 (0.0035) model time 0.5726 (0.5999) loss 7.4905 (7.0837) grad_norm 3.8660 (inf) loss_scale 128.0000 (185.2632) mem 22343MB +[2024-07-27 22:50:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][610/625] eta 0:00:09 lr 0.000150 wd 0.0500 time 0.5721 (0.6024) data time 0.0006 (0.0034) model time 0.5715 (0.5990) loss 7.4658 (7.0913) grad_norm 1.8180 (inf) loss_scale 128.0000 (183.4395) mem 22343MB +[2024-07-27 22:50:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [237/300][620/625] eta 0:00:03 lr 0.000150 wd 0.0500 time 0.5705 (0.6014) data time 0.0004 (0.0033) model time 0.5701 (0.5981) loss 7.6704 (7.1040) grad_norm 2.1728 (inf) loss_scale 128.0000 (181.7284) mem 22343MB +[2024-07-27 22:50:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 237 training takes 0:03:17 +[2024-07-27 22:50:45 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 22:50:50 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 22:50:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.468 (0.468) Loss 0.4993 (0.4993) Acc@1 89.990 (89.990) Acc@5 98.926 (98.926) Mem 22343MB +[2024-07-27 22:50:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7515 (0.6093) Acc@1 82.715 (87.837) Acc@5 97.119 (98.078) Mem 22343MB +[2024-07-27 22:50:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.142) Loss 0.8335 (0.6945) Acc@1 80.615 (85.259) Acc@5 96.436 (97.263) Mem 22343MB +[2024-07-27 22:50:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.891 Acc@5 97.285 +[2024-07-27 22:50:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.9% +[2024-07-27 22:50:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 84.89% +[2024-07-27 22:50:55 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-27 22:50:58 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-27 22:50:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.449 (0.449) Loss 0.5044 (0.5044) Acc@1 90.430 (90.430) Acc@5 98.975 (98.975) Mem 22343MB +[2024-07-27 22:51:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.154) Loss 0.7456 (0.6138) Acc@1 83.252 (87.904) Acc@5 96.973 (98.105) Mem 22343MB +[2024-07-27 22:51:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.141) Loss 0.8462 (0.7003) Acc@1 79.785 (85.098) Acc@5 96.191 (97.294) Mem 22343MB +[2024-07-27 22:51:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.697 Acc@5 97.285 +[2024-07-27 22:51:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.7% +[2024-07-27 22:51:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.70% +[2024-07-27 22:51:02 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-27 22:51:03 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-27 22:51:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][0/625] eta 0:21:06 lr 0.000150 wd 0.0500 time 2.0260 (2.0260) data time 0.3513 (0.3513) model time 0.0000 (0.0000) loss 5.7515 (5.7515) grad_norm 3.4404 (3.4404) loss_scale 128.0000 (128.0000) mem 22337MB +[2024-07-27 22:51:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-27 22:51:11 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 22:51:12 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 22:55:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 22:55:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 22:55:49 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 22:55:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 22:55:59 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 22:55:59 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 22:55:59 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 22:56:00 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 238) +[2024-07-27 22:56:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 23:04:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 23:04:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 23:04:51 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 23:05:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 23:05:00 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 23:05:00 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 23:05:00 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 23:05:00 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 238) +[2024-07-27 23:05:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 23:05:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][10/625] eta 1:30:23 lr 0.000150 wd 0.0500 time 8.8185 (8.8185) data time 0.7355 (0.7355) model time 0.0000 (0.0000) loss 7.9568 (7.9568) grad_norm 2.3151 (2.3151) loss_scale 128.0000 (128.0000) mem 26016MB +[2024-07-27 23:05:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][20/625] eta 0:14:18 lr 0.000150 wd 0.0500 time 0.5760 (1.4185) data time 0.0009 (0.0679) model time 0.0000 (0.0000) loss 6.9287 (7.7383) grad_norm 2.1021 (2.3240) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:05:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][30/625] eta 0:10:05 lr 0.000150 wd 0.0500 time 0.5732 (1.0184) data time 0.0009 (0.0360) model time 0.0000 (0.0000) loss 7.1664 (7.5008) grad_norm 2.3861 (2.3467) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:05:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][40/625] eta 0:08:31 lr 0.000150 wd 0.0500 time 0.5757 (0.8749) data time 0.0006 (0.0247) model time 0.0000 (0.0000) loss 6.5073 (7.4528) grad_norm 1.8984 (2.3464) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:05:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][50/625] eta 0:07:40 lr 0.000150 wd 0.0500 time 0.5757 (0.8016) data time 0.0009 (0.0189) model time 0.0000 (0.0000) loss 7.3740 (7.3619) grad_norm 2.1352 (2.2614) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:05:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][60/625] eta 0:07:10 lr 0.000150 wd 0.0500 time 0.7674 (0.7611) data time 0.0007 (0.0154) model time 0.7668 (0.5941) loss 7.0704 (7.3092) grad_norm 3.1883 (2.3675) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:05:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][70/625] eta 0:06:47 lr 0.000150 wd 0.0500 time 0.5778 (0.7337) data time 0.0009 (0.0130) model time 0.5769 (0.5934) loss 7.6044 (7.2630) grad_norm 2.6113 (2.4107) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:05:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][80/625] eta 0:06:28 lr 0.000149 wd 0.0500 time 0.5789 (0.7121) data time 0.0009 (0.0113) model time 0.5780 (0.5887) loss 7.0670 (7.2496) grad_norm 2.1528 (2.4509) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:06:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][90/625] eta 0:06:12 lr 0.000149 wd 0.0500 time 0.5793 (0.6957) data time 0.0009 (0.0101) model time 0.5785 (0.5862) loss 7.1231 (7.2472) grad_norm 2.0017 (2.3964) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:06:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][100/625] eta 0:05:58 lr 0.000149 wd 0.0500 time 0.5810 (0.6831) data time 0.0006 (0.0091) model time 0.5804 (0.5849) loss 7.9824 (7.2560) grad_norm 1.8590 (2.3688) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:06:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][110/625] eta 0:05:46 lr 0.000149 wd 0.0500 time 0.5744 (0.6727) data time 0.0006 (0.0083) model time 0.5737 (0.5836) loss 8.4264 (7.2604) grad_norm 2.4742 (2.3864) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:06:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][120/625] eta 0:05:35 lr 0.000149 wd 0.0500 time 0.5783 (0.6641) data time 0.0009 (0.0076) model time 0.5774 (0.5826) loss 6.5797 (7.2614) grad_norm 3.4874 (2.4139) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:06:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][130/625] eta 0:05:25 lr 0.000149 wd 0.0500 time 0.5781 (0.6570) data time 0.0007 (0.0070) model time 0.5774 (0.5820) loss 5.8192 (7.2413) grad_norm 2.9220 (2.4146) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:06:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][140/625] eta 0:05:15 lr 0.000149 wd 0.0500 time 0.5756 (0.6511) data time 0.0009 (0.0066) model time 0.5747 (0.5815) loss 7.1207 (7.2157) grad_norm 2.9026 (2.4115) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:06:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][150/625] eta 0:05:06 lr 0.000149 wd 0.0500 time 0.5837 (0.6463) data time 0.0007 (0.0062) model time 0.5829 (0.5816) loss 7.2216 (7.1886) grad_norm 2.0018 (2.4093) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:06:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][160/625] eta 0:04:58 lr 0.000149 wd 0.0500 time 0.5899 (0.6421) data time 0.0009 (0.0059) model time 0.5891 (0.5816) loss 5.3109 (7.1626) grad_norm 1.7794 (2.4461) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:06:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][170/625] eta 0:04:50 lr 0.000149 wd 0.0500 time 0.5800 (0.6384) data time 0.0008 (0.0056) model time 0.5792 (0.5816) loss 8.2935 (7.1792) grad_norm 2.0613 (2.4769) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:06:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][180/625] eta 0:04:42 lr 0.000149 wd 0.0500 time 0.5789 (0.6351) data time 0.0010 (0.0053) model time 0.5779 (0.5816) loss 7.0435 (7.1693) grad_norm 2.3048 (2.4876) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:06:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][190/625] eta 0:04:34 lr 0.000149 wd 0.0500 time 0.5766 (0.6320) data time 0.0009 (0.0051) model time 0.5757 (0.5814) loss 8.5265 (7.1653) grad_norm 2.4796 (2.5566) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:07:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][200/625] eta 0:04:27 lr 0.000149 wd 0.0500 time 0.5822 (0.6293) data time 0.0008 (0.0048) model time 0.5814 (0.5812) loss 6.7318 (7.1571) grad_norm 2.4842 (2.5620) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:07:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][210/625] eta 0:04:20 lr 0.000149 wd 0.0500 time 0.5758 (0.6269) data time 0.0009 (0.0047) model time 0.5749 (0.5812) loss 6.8142 (7.1364) grad_norm 1.8080 (2.5532) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:07:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][220/625] eta 0:04:13 lr 0.000149 wd 0.0500 time 0.5830 (0.6248) data time 0.0009 (0.0045) model time 0.5821 (0.5812) loss 8.1008 (7.1257) grad_norm 2.6197 (2.5310) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:07:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][230/625] eta 0:04:06 lr 0.000148 wd 0.0500 time 0.5828 (0.6229) data time 0.0006 (0.0043) model time 0.5821 (0.5812) loss 6.7678 (7.1131) grad_norm 1.9867 (2.5528) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:07:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][240/625] eta 0:03:59 lr 0.000148 wd 0.0500 time 0.5833 (0.6212) data time 0.0006 (0.0042) model time 0.5827 (0.5812) loss 7.0962 (7.1226) grad_norm 1.9540 (2.5491) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:07:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][250/625] eta 0:03:52 lr 0.000148 wd 0.0500 time 0.5815 (0.6196) data time 0.0006 (0.0040) model time 0.5809 (0.5813) loss 6.3611 (7.1189) grad_norm 3.6823 (2.5570) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:07:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][260/625] eta 0:03:45 lr 0.000148 wd 0.0500 time 0.5790 (0.6180) data time 0.0006 (0.0039) model time 0.5784 (0.5812) loss 6.5313 (7.1142) grad_norm 2.1889 (2.5733) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:07:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][270/625] eta 0:03:38 lr 0.000148 wd 0.0500 time 0.5787 (0.6166) data time 0.0006 (0.0038) model time 0.5781 (0.5811) loss 7.4121 (7.1055) grad_norm 2.3001 (2.5774) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:07:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][280/625] eta 0:03:32 lr 0.000148 wd 0.0500 time 0.5787 (0.6152) data time 0.0006 (0.0037) model time 0.5781 (0.5810) loss 7.3800 (7.0881) grad_norm 2.1899 (2.6229) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:07:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][290/625] eta 0:03:26 lr 0.000148 wd 0.0500 time 0.5795 (0.6150) data time 0.0008 (0.0036) model time 0.5787 (0.5822) loss 7.5844 (7.0943) grad_norm 2.4779 (2.6203) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:08:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][300/625] eta 0:03:19 lr 0.000148 wd 0.0500 time 0.5808 (0.6139) data time 0.0006 (0.0035) model time 0.5802 (0.5822) loss 5.4062 (7.0902) grad_norm 2.2858 (2.6259) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:08:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][310/625] eta 0:03:13 lr 0.000148 wd 0.0500 time 0.5807 (0.6129) data time 0.0010 (0.0034) model time 0.5797 (0.5822) loss 7.4364 (7.0782) grad_norm 2.0727 (2.6237) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:08:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][320/625] eta 0:03:06 lr 0.000148 wd 0.0500 time 0.5789 (0.6120) data time 0.0009 (0.0033) model time 0.5780 (0.5822) loss 8.7215 (7.0864) grad_norm 2.7185 (2.6241) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:08:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][330/625] eta 0:03:00 lr 0.000148 wd 0.0500 time 0.5812 (0.6111) data time 0.0007 (0.0033) model time 0.5805 (0.5822) loss 7.5300 (7.1005) grad_norm 2.3362 (2.6113) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:08:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][340/625] eta 0:02:53 lr 0.000148 wd 0.0500 time 0.5821 (0.6101) data time 0.0007 (0.0032) model time 0.5814 (0.5821) loss 6.4712 (7.0997) grad_norm 11.4077 (2.6253) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:08:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][350/625] eta 0:02:47 lr 0.000148 wd 0.0500 time 0.5840 (0.6092) data time 0.0014 (0.0031) model time 0.5826 (0.5820) loss 9.2838 (7.1098) grad_norm 2.2828 (2.6257) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:08:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][360/625] eta 0:02:41 lr 0.000148 wd 0.0500 time 0.5797 (0.6084) data time 0.0006 (0.0031) model time 0.5791 (0.5819) loss 6.7343 (7.1047) grad_norm 4.5920 (2.6565) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:08:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][370/625] eta 0:02:34 lr 0.000148 wd 0.0500 time 0.5789 (0.6077) data time 0.0007 (0.0030) model time 0.5782 (0.5818) loss 6.5902 (7.1060) grad_norm 2.4005 (2.6556) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:08:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][380/625] eta 0:02:28 lr 0.000147 wd 0.0500 time 0.5812 (0.6070) data time 0.0008 (0.0030) model time 0.5803 (0.5818) loss 6.4289 (7.1025) grad_norm 1.7596 (2.6549) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:08:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][390/625] eta 0:02:22 lr 0.000147 wd 0.0500 time 0.5827 (0.6064) data time 0.0007 (0.0029) model time 0.5821 (0.5819) loss 5.3867 (7.0990) grad_norm 2.5195 (2.6641) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:09:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][400/625] eta 0:02:16 lr 0.000147 wd 0.0500 time 0.5896 (0.6058) data time 0.0007 (0.0029) model time 0.5889 (0.5819) loss 7.2620 (7.0926) grad_norm 2.1715 (2.6581) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:09:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][410/625] eta 0:02:10 lr 0.000147 wd 0.0500 time 0.5771 (0.6052) data time 0.0008 (0.0028) model time 0.5763 (0.5818) loss 6.6077 (7.0865) grad_norm 2.9905 (2.6697) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:09:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][420/625] eta 0:02:03 lr 0.000147 wd 0.0500 time 0.5806 (0.6045) data time 0.0010 (0.0028) model time 0.5796 (0.5817) loss 8.0305 (7.0896) grad_norm 3.4750 (2.6653) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:09:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][430/625] eta 0:01:57 lr 0.000147 wd 0.0500 time 0.5776 (0.6040) data time 0.0007 (0.0027) model time 0.5769 (0.5817) loss 8.3528 (7.0871) grad_norm 2.5729 (2.6610) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:09:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][440/625] eta 0:01:51 lr 0.000147 wd 0.0500 time 0.5802 (0.6035) data time 0.0008 (0.0027) model time 0.5794 (0.5817) loss 7.8667 (7.0917) grad_norm 2.2442 (2.6600) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:09:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][450/625] eta 0:01:45 lr 0.000147 wd 0.0500 time 0.5800 (0.6030) data time 0.0007 (0.0026) model time 0.5793 (0.5817) loss 7.3325 (7.0951) grad_norm 3.8039 (2.6636) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:09:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][460/625] eta 0:01:39 lr 0.000147 wd 0.0500 time 0.5826 (0.6026) data time 0.0007 (0.0026) model time 0.5819 (0.5817) loss 6.6116 (7.0891) grad_norm 2.9151 (2.6605) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:09:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][470/625] eta 0:01:33 lr 0.000147 wd 0.0500 time 0.5757 (0.6022) data time 0.0009 (0.0026) model time 0.5748 (0.5818) loss 7.8836 (7.0855) grad_norm 3.6426 (2.6635) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:09:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][480/625] eta 0:01:27 lr 0.000147 wd 0.0500 time 0.5772 (0.6017) data time 0.0008 (0.0026) model time 0.5763 (0.5817) loss 7.3205 (7.0713) grad_norm 1.9775 (2.6554) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:09:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][490/625] eta 0:01:21 lr 0.000147 wd 0.0500 time 0.5817 (0.6013) data time 0.0007 (0.0025) model time 0.5810 (0.5816) loss 6.4944 (7.0698) grad_norm 2.5122 (2.6697) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:10:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][500/625] eta 0:01:15 lr 0.000147 wd 0.0500 time 0.5218 (0.6012) data time 0.0010 (0.0025) model time 0.5208 (0.5819) loss 7.1962 (7.0682) grad_norm 2.3270 (2.6861) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:10:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][510/625] eta 0:01:09 lr 0.000147 wd 0.0500 time 0.5795 (0.6011) data time 0.0009 (0.0025) model time 0.5787 (0.5822) loss 7.8839 (7.0692) grad_norm 2.7036 (2.6912) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:10:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-27 23:10:11 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 23:10:14 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 23:12:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 23:12:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 23:14:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 23:15:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 23:15:25 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 23:15:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 23:15:36 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 23:15:36 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 23:15:36 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 23:15:36 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 238) +[2024-07-27 23:15:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 23:15:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][520/625] eta 0:09:15 lr 0.000146 wd 0.0500 time 1.6391 (5.2934) data time 0.0008 (0.3680) model time 1.6383 (4.9254) loss 8.1557 (8.0877) grad_norm 2.6978 (2.2708) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:15:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][530/625] eta 0:02:08 lr 0.000146 wd 0.0500 time 0.5684 (1.3554) data time 0.0007 (0.0621) model time 0.5678 (1.2933) loss 6.5987 (7.3653) grad_norm 2.4161 (2.7414) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-27 23:16:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][540/625] eta 0:01:24 lr 0.000146 wd 0.0500 time 0.5668 (0.9985) data time 0.0009 (0.0343) model time 0.5658 (0.9642) loss 7.4614 (7.3116) grad_norm 2.2963 (2.6111) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-27 23:16:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][550/625] eta 0:01:04 lr 0.000146 wd 0.0500 time 0.5800 (0.8645) data time 0.0007 (0.0238) model time 0.5793 (0.8406) loss 7.1638 (7.3065) grad_norm 1.8878 (2.6440) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-27 23:16:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][560/625] eta 0:00:51 lr 0.000146 wd 0.0500 time 0.5814 (0.7947) data time 0.0010 (0.0184) model time 0.5804 (0.7763) loss 7.4905 (7.2360) grad_norm 1.7692 (2.5644) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-27 23:16:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][570/625] eta 0:00:41 lr 0.000146 wd 0.0500 time 0.5235 (0.7548) data time 0.0008 (0.0150) model time 0.5227 (0.7398) loss 7.3651 (7.2349) grad_norm 1.9658 (2.5526) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-27 23:16:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][580/625] eta 0:00:32 lr 0.000146 wd 0.0500 time 0.5765 (0.7291) data time 0.0006 (0.0129) model time 0.5759 (0.7162) loss 7.1388 (7.1973) grad_norm 1.6829 (2.4844) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-27 23:16:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-27 23:16:27 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 23:16:32 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 23:18:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 23:18:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 23:19:13 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 23:19:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 23:19:25 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 23:19:26 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 23:19:26 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 23:19:26 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 238) +[2024-07-27 23:19:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 23:19:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][590/625] eta 0:01:17 lr 0.000146 wd 0.0500 time 0.5165 (2.2035) data time 0.0007 (0.1309) model time 0.5158 (2.0726) loss 6.8481 (7.3299) grad_norm 2.1912 (2.0952) loss_scale 128.0000 (128.0000) mem 22343MB +[2024-07-27 23:19:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][600/625] eta 0:00:31 lr 0.000146 wd 0.0500 time 0.5174 (1.2713) data time 0.0007 (0.0588) model time 0.5166 (1.2125) loss 7.5798 (7.1697) grad_norm 5.6034 (2.6223) loss_scale 128.0000 (128.0000) mem 22343MB +[2024-07-27 23:19:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][610/625] eta 0:00:15 lr 0.000146 wd 0.0500 time 0.5256 (1.0075) data time 0.0010 (0.0383) model time 0.5245 (0.9692) loss 7.7447 (7.2355) grad_norm 2.6768 (2.5699) loss_scale 128.0000 (128.0000) mem 22343MB +[2024-07-27 23:20:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [238/300][620/625] eta 0:00:04 lr 0.000146 wd 0.0500 time 0.5445 (0.8797) data time 0.0010 (0.0284) model time 0.5434 (0.8513) loss 7.1002 (7.2368) grad_norm 2.6956 (2.5396) loss_scale 128.0000 (128.0000) mem 22343MB +[2024-07-27 23:20:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 238 training takes 0:00:35 +[2024-07-27 23:20:07 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 23:20:11 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 23:20:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.727 (0.727) Loss 0.4934 (0.4934) Acc@1 90.088 (90.088) Acc@5 98.779 (98.779) Mem 22343MB +[2024-07-27 23:20:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.173) Loss 0.7412 (0.6035) Acc@1 83.154 (87.939) Acc@5 97.070 (98.091) Mem 22343MB +[2024-07-27 23:20:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.146) Loss 0.8433 (0.6939) Acc@1 80.029 (85.231) Acc@5 96.045 (97.231) Mem 22343MB +[2024-07-27 23:20:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.845 Acc@5 97.255 +[2024-07-27 23:20:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.8% +[2024-07-27 23:20:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 1.068 (1.068) Loss 0.5049 (0.5049) Acc@1 90.430 (90.430) Acc@5 98.975 (98.975) Mem 22343MB +[2024-07-27 23:20:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.209) Loss 0.7441 (0.6137) Acc@1 83.154 (87.917) Acc@5 96.924 (98.105) Mem 22343MB +[2024-07-27 23:20:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.165) Loss 0.8462 (0.7001) Acc@1 79.883 (85.107) Acc@5 96.191 (97.301) Mem 22343MB +[2024-07-27 23:20:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.719 Acc@5 97.289 +[2024-07-27 23:20:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.7% +[2024-07-27 23:20:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.72% +[2024-07-27 23:20:22 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-27 23:20:25 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-27 23:20:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][0/625] eta 0:16:14 lr 0.000146 wd 0.0500 time 1.5599 (1.5599) data time 0.5880 (0.5880) model time 0.0000 (0.0000) loss 6.9377 (6.9377) grad_norm 4.0818 (4.0818) loss_scale 128.0000 (128.0000) mem 22337MB +[2024-07-27 23:20:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][10/625] eta 0:06:25 lr 0.000146 wd 0.0500 time 0.5258 (0.6270) data time 0.0009 (0.0545) model time 0.0000 (0.0000) loss 7.1854 (7.0420) grad_norm 2.1510 (2.8585) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:20:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][20/625] eta 0:05:57 lr 0.000146 wd 0.0500 time 0.5544 (0.5909) data time 0.0009 (0.0290) model time 0.0000 (0.0000) loss 6.2156 (6.9533) grad_norm 2.8204 (2.7708) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:20:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][30/625] eta 0:05:39 lr 0.000146 wd 0.0500 time 0.5162 (0.5706) data time 0.0010 (0.0202) model time 0.0000 (0.0000) loss 8.7621 (6.9714) grad_norm 2.3708 (2.6737) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:20:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][40/625] eta 0:05:27 lr 0.000146 wd 0.0500 time 0.5441 (0.5605) data time 0.0011 (0.0156) model time 0.0000 (0.0000) loss 6.0578 (6.9821) grad_norm 2.0490 (2.6038) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:20:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][50/625] eta 0:05:18 lr 0.000145 wd 0.0500 time 0.5259 (0.5535) data time 0.0007 (0.0127) model time 0.0000 (0.0000) loss 7.6594 (6.9992) grad_norm 2.2423 (2.6757) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:20:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][60/625] eta 0:05:11 lr 0.000145 wd 0.0500 time 0.5442 (0.5506) data time 0.0011 (0.0108) model time 0.5431 (0.5347) loss 7.7647 (7.0891) grad_norm 2.3952 (2.7373) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:21:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][70/625] eta 0:05:03 lr 0.000145 wd 0.0500 time 0.5252 (0.5468) data time 0.0010 (0.0094) model time 0.5242 (0.5286) loss 7.1687 (7.0822) grad_norm 2.3650 (2.7091) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:21:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][80/625] eta 0:04:56 lr 0.000145 wd 0.0500 time 0.5173 (0.5438) data time 0.0008 (0.0084) model time 0.5165 (0.5261) loss 6.8973 (7.0591) grad_norm 2.2250 (2.9214) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:21:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][90/625] eta 0:04:50 lr 0.000145 wd 0.0500 time 0.5169 (0.5423) data time 0.0009 (0.0076) model time 0.5161 (0.5268) loss 8.1882 (7.0736) grad_norm 3.7101 (2.8776) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:21:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][100/625] eta 0:04:44 lr 0.000145 wd 0.0500 time 0.5258 (0.5412) data time 0.0009 (0.0069) model time 0.5249 (0.5276) loss 8.5819 (7.0983) grad_norm 2.3249 (2.8690) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:21:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][110/625] eta 0:04:37 lr 0.000145 wd 0.0500 time 0.5176 (0.5397) data time 0.0007 (0.0064) model time 0.5169 (0.5268) loss 6.7417 (7.0835) grad_norm 2.0424 (2.8242) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:21:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][120/625] eta 0:04:32 lr 0.000145 wd 0.0500 time 0.5157 (0.5388) data time 0.0008 (0.0060) model time 0.5149 (0.5270) loss 5.7219 (7.0808) grad_norm 3.1350 (2.7951) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:21:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][130/625] eta 0:04:25 lr 0.000145 wd 0.0500 time 0.5178 (0.5372) data time 0.0009 (0.0056) model time 0.5169 (0.5256) loss 8.3870 (7.0866) grad_norm 2.6343 (2.8013) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:21:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][140/625] eta 0:04:20 lr 0.000145 wd 0.0500 time 0.5135 (0.5366) data time 0.0010 (0.0053) model time 0.5125 (0.5258) loss 8.0190 (7.0721) grad_norm 2.4832 (2.7713) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:21:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][150/625] eta 0:04:14 lr 0.000145 wd 0.0500 time 0.5150 (0.5357) data time 0.0008 (0.0050) model time 0.5142 (0.5255) loss 6.9435 (7.0708) grad_norm 2.3910 (2.7588) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:21:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][160/625] eta 0:04:08 lr 0.000145 wd 0.0500 time 0.5214 (0.5352) data time 0.0007 (0.0048) model time 0.5206 (0.5256) loss 6.8674 (7.0567) grad_norm 2.6622 (2.7451) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:21:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][170/625] eta 0:04:03 lr 0.000145 wd 0.0500 time 0.5659 (0.5345) data time 0.0009 (0.0045) model time 0.5650 (0.5253) loss 5.4801 (7.0452) grad_norm 3.9896 (2.7409) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:22:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][180/625] eta 0:03:58 lr 0.000145 wd 0.0500 time 0.5180 (0.5353) data time 0.0007 (0.0044) model time 0.5173 (0.5270) loss 5.6994 (7.0424) grad_norm 2.9548 (2.7410) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:22:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][190/625] eta 0:03:52 lr 0.000144 wd 0.0500 time 0.5179 (0.5349) data time 0.0010 (0.0042) model time 0.5169 (0.5269) loss 6.7093 (7.0374) grad_norm 2.7398 (2.7215) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:22:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][200/625] eta 0:03:47 lr 0.000144 wd 0.0500 time 0.5175 (0.5345) data time 0.0009 (0.0041) model time 0.5167 (0.5270) loss 8.6640 (7.0554) grad_norm 2.7630 (2.7080) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:22:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][210/625] eta 0:03:41 lr 0.000144 wd 0.0500 time 0.5673 (0.5343) data time 0.0010 (0.0039) model time 0.5663 (0.5270) loss 7.7442 (7.0381) grad_norm 2.1890 (2.7281) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:22:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][220/625] eta 0:03:36 lr 0.000144 wd 0.0500 time 0.5353 (0.5339) data time 0.0012 (0.0038) model time 0.5341 (0.5269) loss 6.3829 (7.0157) grad_norm 2.4789 (2.7352) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:22:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][230/625] eta 0:03:30 lr 0.000144 wd 0.0500 time 0.5522 (0.5334) data time 0.0005 (0.0037) model time 0.5516 (0.5266) loss 7.2876 (7.0118) grad_norm 2.6750 (2.7371) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:22:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][240/625] eta 0:03:25 lr 0.000144 wd 0.0500 time 0.5169 (0.5339) data time 0.0008 (0.0036) model time 0.5161 (0.5275) loss 7.9655 (7.0252) grad_norm 2.2124 (2.7251) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:22:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][250/625] eta 0:03:20 lr 0.000144 wd 0.0500 time 0.5280 (0.5337) data time 0.0012 (0.0035) model time 0.5269 (0.5275) loss 6.6194 (7.0154) grad_norm 1.8542 (2.8068) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:22:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][260/625] eta 0:03:14 lr 0.000144 wd 0.0500 time 0.5169 (0.5333) data time 0.0008 (0.0034) model time 0.5162 (0.5273) loss 6.7795 (7.0075) grad_norm 1.9493 (2.8856) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:22:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][270/625] eta 0:03:09 lr 0.000144 wd 0.0500 time 0.5279 (0.5332) data time 0.0018 (0.0033) model time 0.5261 (0.5274) loss 7.4560 (7.0102) grad_norm 1.9067 (2.9158) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:22:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][280/625] eta 0:03:03 lr 0.000144 wd 0.0500 time 0.5255 (0.5326) data time 0.0008 (0.0032) model time 0.5247 (0.5269) loss 7.5415 (7.0277) grad_norm 1.7872 (2.9150) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:23:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][290/625] eta 0:02:58 lr 0.000144 wd 0.0500 time 0.5187 (0.5327) data time 0.0009 (0.0031) model time 0.5178 (0.5272) loss 8.0968 (7.0303) grad_norm 2.5371 (2.8969) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:23:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][300/625] eta 0:02:53 lr 0.000144 wd 0.0500 time 0.5230 (0.5323) data time 0.0009 (0.0031) model time 0.5222 (0.5269) loss 7.4193 (7.0388) grad_norm 2.0704 (2.8820) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:23:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][310/625] eta 0:02:47 lr 0.000144 wd 0.0500 time 0.5165 (0.5321) data time 0.0008 (0.0030) model time 0.5158 (0.5268) loss 7.8088 (7.0435) grad_norm 4.3163 (2.9312) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:23:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][320/625] eta 0:02:42 lr 0.000144 wd 0.0500 time 0.5188 (0.5316) data time 0.0007 (0.0029) model time 0.5181 (0.5264) loss 6.3145 (7.0378) grad_norm 1.6656 (2.9233) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:23:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][330/625] eta 0:02:36 lr 0.000144 wd 0.0500 time 0.5183 (0.5315) data time 0.0007 (0.0029) model time 0.5176 (0.5264) loss 7.8254 (7.0418) grad_norm 2.6660 (2.9154) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:23:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][340/625] eta 0:02:31 lr 0.000143 wd 0.0500 time 0.5177 (0.5312) data time 0.0009 (0.0028) model time 0.5167 (0.5262) loss 6.4003 (7.0360) grad_norm 2.5303 (2.9054) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:23:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][350/625] eta 0:02:26 lr 0.000143 wd 0.0500 time 0.5163 (0.5312) data time 0.0011 (0.0028) model time 0.5152 (0.5264) loss 8.5236 (7.0364) grad_norm 2.2836 (2.8861) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:23:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][360/625] eta 0:02:20 lr 0.000143 wd 0.0500 time 0.5489 (0.5310) data time 0.0008 (0.0027) model time 0.5480 (0.5261) loss 7.1096 (7.0465) grad_norm 3.1697 (2.8753) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:23:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][370/625] eta 0:02:15 lr 0.000143 wd 0.0500 time 0.5155 (0.5308) data time 0.0009 (0.0027) model time 0.5146 (0.5261) loss 7.6320 (7.0544) grad_norm 1.7622 (2.8732) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:23:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][380/625] eta 0:02:10 lr 0.000143 wd 0.0500 time 0.5196 (0.5308) data time 0.0007 (0.0026) model time 0.5189 (0.5261) loss 6.8348 (7.0500) grad_norm 2.1064 (2.8686) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:23:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][390/625] eta 0:02:04 lr 0.000143 wd 0.0500 time 0.5167 (0.5307) data time 0.0007 (0.0026) model time 0.5160 (0.5262) loss 7.3742 (7.0593) grad_norm 2.6830 (2.8547) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:23:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][400/625] eta 0:01:59 lr 0.000143 wd 0.0500 time 0.5547 (0.5309) data time 0.0007 (0.0026) model time 0.5540 (0.5265) loss 6.5526 (7.0588) grad_norm 2.0574 (2.8364) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:24:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][410/625] eta 0:01:54 lr 0.000143 wd 0.0500 time 0.5504 (0.5308) data time 0.0010 (0.0025) model time 0.5495 (0.5265) loss 5.2970 (7.0524) grad_norm 2.5536 (2.8308) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:24:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][420/625] eta 0:01:48 lr 0.000143 wd 0.0500 time 0.5124 (0.5306) data time 0.0012 (0.0025) model time 0.5112 (0.5264) loss 6.4142 (7.0412) grad_norm 2.2537 (2.8200) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:24:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][430/625] eta 0:01:43 lr 0.000143 wd 0.0500 time 0.5170 (0.5305) data time 0.0007 (0.0025) model time 0.5163 (0.5263) loss 6.4267 (7.0295) grad_norm 1.7878 (2.8039) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:24:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][440/625] eta 0:01:38 lr 0.000143 wd 0.0500 time 0.5384 (0.5305) data time 0.0006 (0.0024) model time 0.5377 (0.5264) loss 7.2126 (7.0294) grad_norm 2.3645 (2.7845) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:24:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][450/625] eta 0:01:32 lr 0.000143 wd 0.0500 time 0.5172 (0.5304) data time 0.0007 (0.0024) model time 0.5165 (0.5263) loss 6.3067 (7.0329) grad_norm 2.3178 (2.7688) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:24:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][460/625] eta 0:01:27 lr 0.000143 wd 0.0500 time 0.5157 (0.5308) data time 0.0007 (0.0024) model time 0.5151 (0.5268) loss 6.1528 (7.0283) grad_norm 1.6755 (2.7545) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:24:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][470/625] eta 0:01:22 lr 0.000143 wd 0.0500 time 0.5151 (0.5306) data time 0.0007 (0.0023) model time 0.5144 (0.5267) loss 8.3174 (7.0444) grad_norm 2.0431 (2.7398) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:24:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][480/625] eta 0:01:16 lr 0.000143 wd 0.0500 time 0.5181 (0.5306) data time 0.0009 (0.0023) model time 0.5172 (0.5268) loss 8.3607 (7.0412) grad_norm 2.3479 (2.7382) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:24:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][490/625] eta 0:01:11 lr 0.000142 wd 0.0500 time 0.5335 (0.5305) data time 0.0007 (0.0023) model time 0.5328 (0.5267) loss 6.8552 (7.0373) grad_norm 3.2074 (2.7367) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:24:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][500/625] eta 0:01:06 lr 0.000142 wd 0.0500 time 0.5223 (0.5304) data time 0.0010 (0.0023) model time 0.5214 (0.5267) loss 6.6968 (7.0335) grad_norm 1.7618 (2.7302) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:24:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][510/625] eta 0:01:00 lr 0.000142 wd 0.0500 time 0.5321 (0.5302) data time 0.0007 (0.0022) model time 0.5314 (0.5265) loss 7.1328 (7.0318) grad_norm 2.1491 (2.7179) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:25:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][520/625] eta 0:00:55 lr 0.000142 wd 0.0500 time 0.5179 (0.5302) data time 0.0009 (0.0022) model time 0.5170 (0.5266) loss 7.4459 (7.0377) grad_norm 2.0577 (2.7145) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:25:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][530/625] eta 0:00:50 lr 0.000142 wd 0.0500 time 0.5181 (0.5301) data time 0.0007 (0.0022) model time 0.5174 (0.5265) loss 6.4717 (7.0413) grad_norm 4.9562 (2.7151) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:25:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][540/625] eta 0:00:45 lr 0.000142 wd 0.0500 time 0.5149 (0.5302) data time 0.0012 (0.0022) model time 0.5137 (0.5267) loss 7.7978 (7.0433) grad_norm 2.3083 (2.7126) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:25:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][550/625] eta 0:00:39 lr 0.000142 wd 0.0500 time 0.5182 (0.5302) data time 0.0009 (0.0022) model time 0.5173 (0.5267) loss 6.4574 (7.0457) grad_norm 2.4236 (2.7032) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:25:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][560/625] eta 0:00:34 lr 0.000142 wd 0.0500 time 0.6220 (0.5303) data time 0.0009 (0.0021) model time 0.6211 (0.5269) loss 6.4897 (7.0450) grad_norm 1.7067 (2.6968) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:25:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][570/625] eta 0:00:29 lr 0.000142 wd 0.0500 time 0.5509 (0.5302) data time 0.0009 (0.0021) model time 0.5499 (0.5268) loss 7.4530 (7.0425) grad_norm 2.4894 (2.6997) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:25:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][580/625] eta 0:00:23 lr 0.000142 wd 0.0500 time 0.5163 (0.5302) data time 0.0006 (0.0021) model time 0.5157 (0.5269) loss 7.6235 (7.0513) grad_norm 3.8152 (2.7025) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:25:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][590/625] eta 0:00:18 lr 0.000142 wd 0.0500 time 0.5183 (0.5302) data time 0.0012 (0.0021) model time 0.5171 (0.5270) loss 7.6908 (7.0555) grad_norm 2.9710 (2.7014) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:25:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][600/625] eta 0:00:13 lr 0.000142 wd 0.0500 time 0.5197 (0.5302) data time 0.0007 (0.0021) model time 0.5190 (0.5269) loss 7.6707 (7.0526) grad_norm 1.9284 (2.6983) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:25:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][610/625] eta 0:00:07 lr 0.000142 wd 0.0500 time 0.5717 (0.5302) data time 0.0005 (0.0021) model time 0.5712 (0.5270) loss 7.6813 (7.0490) grad_norm 1.9345 (2.6894) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:25:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [239/300][620/625] eta 0:00:02 lr 0.000142 wd 0.0500 time 0.5170 (0.5303) data time 0.0006 (0.0021) model time 0.5164 (0.5271) loss 6.6773 (7.0410) grad_norm 2.2425 (2.6818) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:25:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 239 training takes 0:05:31 +[2024-07-27 23:25:57 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 23:25:58 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 23:25:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.615 (0.615) Loss 0.4949 (0.4949) Acc@1 90.576 (90.576) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-27 23:26:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.163) Loss 0.7461 (0.6054) Acc@1 83.008 (87.913) Acc@5 96.973 (98.065) Mem 22339MB +[2024-07-27 23:26:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.141) Loss 0.8398 (0.6925) Acc@1 80.664 (85.235) Acc@5 96.289 (97.303) Mem 22339MB +[2024-07-27 23:26:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.899 Acc@5 97.291 +[2024-07-27 23:26:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.9% +[2024-07-27 23:26:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 84.90% +[2024-07-27 23:26:02 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-27 23:26:04 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-27 23:26:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.595 (0.595) Loss 0.5044 (0.5044) Acc@1 90.430 (90.430) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-27 23:26:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.161) Loss 0.7441 (0.6132) Acc@1 83.154 (87.931) Acc@5 96.924 (98.096) Mem 22339MB +[2024-07-27 23:26:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.140) Loss 0.8452 (0.6995) Acc@1 79.932 (85.112) Acc@5 96.143 (97.296) Mem 22339MB +[2024-07-27 23:26:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.721 Acc@5 97.297 +[2024-07-27 23:26:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.7% +[2024-07-27 23:26:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.72% +[2024-07-27 23:26:07 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-27 23:26:09 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-27 23:26:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][0/625] eta 0:12:31 lr 0.000142 wd 0.0500 time 1.2031 (1.2031) data time 0.6839 (0.6839) model time 0.0000 (0.0000) loss 6.7557 (6.7557) grad_norm 2.8301 (2.8301) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:26:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][10/625] eta 0:05:59 lr 0.000142 wd 0.0500 time 0.5174 (0.5843) data time 0.0009 (0.0631) model time 0.0000 (0.0000) loss 7.3977 (7.0077) grad_norm 2.9788 (3.1101) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:26:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][20/625] eta 0:05:37 lr 0.000141 wd 0.0500 time 0.5196 (0.5584) data time 0.0007 (0.0335) model time 0.0000 (0.0000) loss 6.1983 (6.9014) grad_norm 1.8639 (3.0403) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:26:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][30/625] eta 0:05:25 lr 0.000141 wd 0.0500 time 0.5160 (0.5471) data time 0.0007 (0.0230) model time 0.0000 (0.0000) loss 7.2434 (6.9405) grad_norm 2.1218 (3.0639) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:26:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][40/625] eta 0:05:17 lr 0.000141 wd 0.0500 time 0.5171 (0.5430) data time 0.0009 (0.0177) model time 0.0000 (0.0000) loss 6.4022 (6.9652) grad_norm 2.1493 (3.1061) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:26:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][50/625] eta 0:05:11 lr 0.000141 wd 0.0500 time 0.5175 (0.5421) data time 0.0007 (0.0144) model time 0.0000 (0.0000) loss 7.6340 (6.9741) grad_norm 2.6845 (3.0094) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:26:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][60/625] eta 0:05:04 lr 0.000141 wd 0.0500 time 0.5172 (0.5391) data time 0.0010 (0.0122) model time 0.5163 (0.5229) loss 6.7003 (6.9631) grad_norm 2.4362 (2.9160) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:26:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][70/625] eta 0:04:57 lr 0.000141 wd 0.0500 time 0.5164 (0.5367) data time 0.0011 (0.0106) model time 0.5153 (0.5220) loss 8.0983 (7.0323) grad_norm 2.2606 (2.7953) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:26:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][80/625] eta 0:04:52 lr 0.000141 wd 0.0500 time 0.5202 (0.5361) data time 0.0006 (0.0094) model time 0.5196 (0.5248) loss 6.5439 (7.0291) grad_norm 3.7622 (2.8861) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:26:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][90/625] eta 0:04:46 lr 0.000141 wd 0.0500 time 0.5135 (0.5348) data time 0.0009 (0.0085) model time 0.5126 (0.5244) loss 7.3860 (7.0286) grad_norm 2.6344 (2.8961) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:27:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][100/625] eta 0:04:40 lr 0.000141 wd 0.0500 time 0.5675 (0.5340) data time 0.0009 (0.0078) model time 0.5666 (0.5247) loss 6.8877 (7.0470) grad_norm 2.7177 (2.8771) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:27:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][110/625] eta 0:04:34 lr 0.000141 wd 0.0500 time 0.5503 (0.5332) data time 0.0007 (0.0072) model time 0.5496 (0.5245) loss 7.9448 (7.0573) grad_norm 2.4526 (3.0688) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:27:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][120/625] eta 0:04:29 lr 0.000141 wd 0.0500 time 0.5170 (0.5341) data time 0.0007 (0.0067) model time 0.5163 (0.5272) loss 7.3676 (7.0638) grad_norm 2.7640 (3.0155) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:27:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][130/625] eta 0:04:24 lr 0.000141 wd 0.0500 time 0.5364 (0.5334) data time 0.0011 (0.0063) model time 0.5353 (0.5267) loss 6.4448 (7.0510) grad_norm 2.3703 (3.0177) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:27:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][140/625] eta 0:04:18 lr 0.000141 wd 0.0500 time 0.5201 (0.5328) data time 0.0010 (0.0059) model time 0.5191 (0.5265) loss 8.5354 (7.0454) grad_norm 2.8225 (3.0067) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:27:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][150/625] eta 0:04:13 lr 0.000141 wd 0.0500 time 0.5771 (0.5327) data time 0.0011 (0.0056) model time 0.5760 (0.5269) loss 7.6225 (7.0439) grad_norm 3.3783 (2.9731) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:27:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][160/625] eta 0:04:07 lr 0.000141 wd 0.0500 time 0.5207 (0.5318) data time 0.0008 (0.0053) model time 0.5198 (0.5260) loss 7.3154 (7.0230) grad_norm 2.6890 (2.9416) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:27:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][170/625] eta 0:04:01 lr 0.000140 wd 0.0500 time 0.5167 (0.5318) data time 0.0007 (0.0050) model time 0.5160 (0.5263) loss 7.5469 (7.0181) grad_norm 2.3717 (2.9275) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:27:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][180/625] eta 0:03:56 lr 0.000140 wd 0.0500 time 0.5162 (0.5312) data time 0.0007 (0.0048) model time 0.5155 (0.5258) loss 5.5077 (6.9967) grad_norm 1.6840 (2.8893) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:27:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][190/625] eta 0:03:50 lr 0.000140 wd 0.0500 time 0.5155 (0.5310) data time 0.0008 (0.0046) model time 0.5147 (0.5258) loss 6.2100 (7.0164) grad_norm 1.8266 (2.8546) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:27:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][200/625] eta 0:03:45 lr 0.000140 wd 0.0500 time 0.5295 (0.5305) data time 0.0010 (0.0044) model time 0.5285 (0.5255) loss 8.3115 (7.0251) grad_norm 1.7654 (2.8254) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:28:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][210/625] eta 0:03:40 lr 0.000140 wd 0.0500 time 0.5195 (0.5302) data time 0.0009 (0.0043) model time 0.5185 (0.5253) loss 7.3919 (7.0225) grad_norm 3.1146 (2.8081) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:28:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][220/625] eta 0:03:34 lr 0.000140 wd 0.0500 time 0.5175 (0.5299) data time 0.0009 (0.0042) model time 0.5167 (0.5251) loss 8.2951 (7.0380) grad_norm 1.5697 (2.8005) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:28:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][230/625] eta 0:03:29 lr 0.000140 wd 0.0500 time 0.5244 (0.5298) data time 0.0006 (0.0040) model time 0.5238 (0.5252) loss 6.1551 (7.0218) grad_norm 1.9283 (2.7784) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:28:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][240/625] eta 0:03:23 lr 0.000140 wd 0.0500 time 0.5179 (0.5295) data time 0.0008 (0.0039) model time 0.5171 (0.5250) loss 7.7945 (7.0304) grad_norm 2.6057 (2.7528) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:28:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][250/625] eta 0:03:18 lr 0.000140 wd 0.0500 time 0.5575 (0.5294) data time 0.0013 (0.0038) model time 0.5562 (0.5251) loss 7.0223 (7.0239) grad_norm 2.5930 (2.7345) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:28:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][260/625] eta 0:03:13 lr 0.000140 wd 0.0500 time 0.5183 (0.5290) data time 0.0007 (0.0037) model time 0.5176 (0.5247) loss 6.1469 (7.0275) grad_norm 2.9616 (2.7360) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:28:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][270/625] eta 0:03:08 lr 0.000140 wd 0.0500 time 0.7314 (0.5298) data time 0.0007 (0.0036) model time 0.7307 (0.5259) loss 6.4769 (7.0240) grad_norm 2.5087 (2.7164) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:28:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][280/625] eta 0:03:02 lr 0.000140 wd 0.0500 time 0.5174 (0.5296) data time 0.0006 (0.0035) model time 0.5167 (0.5257) loss 6.8718 (7.0231) grad_norm 2.4555 (2.7112) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:28:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][290/625] eta 0:02:57 lr 0.000140 wd 0.0500 time 0.5178 (0.5296) data time 0.0007 (0.0034) model time 0.5171 (0.5258) loss 7.1478 (7.0141) grad_norm 2.7139 (2.7348) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:28:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][300/625] eta 0:02:52 lr 0.000140 wd 0.0500 time 0.5615 (0.5295) data time 0.0018 (0.0033) model time 0.5597 (0.5258) loss 7.5392 (7.0202) grad_norm 1.9346 (2.7331) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:28:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][310/625] eta 0:02:46 lr 0.000140 wd 0.0500 time 0.5265 (0.5293) data time 0.0008 (0.0033) model time 0.5257 (0.5257) loss 6.0301 (7.0257) grad_norm 4.1866 (2.7409) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:28:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][320/625] eta 0:02:41 lr 0.000139 wd 0.0500 time 0.5941 (0.5293) data time 0.0013 (0.0032) model time 0.5928 (0.5258) loss 6.8421 (7.0367) grad_norm 2.7067 (2.7522) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:29:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][330/625] eta 0:02:36 lr 0.000139 wd 0.0500 time 0.5153 (0.5291) data time 0.0009 (0.0031) model time 0.5144 (0.5257) loss 7.2565 (7.0440) grad_norm 3.8415 (2.7593) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:29:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][340/625] eta 0:02:30 lr 0.000139 wd 0.0500 time 0.5449 (0.5297) data time 0.0012 (0.0031) model time 0.5436 (0.5264) loss 7.2558 (7.0516) grad_norm 1.9696 (2.7491) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:29:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][350/625] eta 0:02:25 lr 0.000139 wd 0.0500 time 0.5185 (0.5296) data time 0.0006 (0.0030) model time 0.5179 (0.5264) loss 6.2662 (7.0619) grad_norm 2.6805 (2.7822) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:29:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][360/625] eta 0:02:20 lr 0.000139 wd 0.0500 time 0.5347 (0.5294) data time 0.0010 (0.0030) model time 0.5337 (0.5262) loss 5.7017 (7.0573) grad_norm 2.0665 (2.7715) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:29:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][370/625] eta 0:02:14 lr 0.000139 wd 0.0500 time 0.5162 (0.5291) data time 0.0007 (0.0029) model time 0.5155 (0.5260) loss 8.2618 (7.0588) grad_norm 2.8668 (2.7806) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:29:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][380/625] eta 0:02:09 lr 0.000139 wd 0.0500 time 0.5166 (0.5291) data time 0.0013 (0.0029) model time 0.5152 (0.5260) loss 6.0991 (7.0464) grad_norm 2.2484 (2.8291) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:29:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][390/625] eta 0:02:04 lr 0.000139 wd 0.0500 time 0.5169 (0.5290) data time 0.0008 (0.0028) model time 0.5161 (0.5259) loss 8.5517 (7.0549) grad_norm 3.1299 (2.8246) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:29:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][400/625] eta 0:01:59 lr 0.000139 wd 0.0500 time 0.5159 (0.5291) data time 0.0008 (0.0028) model time 0.5151 (0.5261) loss 7.0886 (7.0547) grad_norm 2.8938 (2.8201) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:29:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][410/625] eta 0:01:53 lr 0.000139 wd 0.0500 time 0.5793 (0.5289) data time 0.0009 (0.0027) model time 0.5784 (0.5260) loss 7.5590 (7.0476) grad_norm 1.8416 (2.8315) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:29:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][420/625] eta 0:01:48 lr 0.000139 wd 0.0500 time 0.5179 (0.5288) data time 0.0007 (0.0027) model time 0.5171 (0.5259) loss 6.1818 (7.0443) grad_norm 3.1953 (2.8230) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:29:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][430/625] eta 0:01:43 lr 0.000139 wd 0.0500 time 0.5192 (0.5287) data time 0.0007 (0.0027) model time 0.5185 (0.5257) loss 5.9707 (7.0442) grad_norm 1.7628 (2.8041) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:30:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][440/625] eta 0:01:37 lr 0.000139 wd 0.0500 time 0.5178 (0.5287) data time 0.0007 (0.0026) model time 0.5172 (0.5258) loss 7.0845 (7.0480) grad_norm 1.8427 (2.7989) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:30:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][450/625] eta 0:01:32 lr 0.000139 wd 0.0500 time 0.5430 (0.5286) data time 0.0013 (0.0026) model time 0.5417 (0.5258) loss 8.1630 (7.0578) grad_norm 2.0694 (2.7848) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:30:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][460/625] eta 0:01:27 lr 0.000139 wd 0.0500 time 0.5188 (0.5285) data time 0.0008 (0.0026) model time 0.5180 (0.5257) loss 6.7368 (7.0578) grad_norm 2.0369 (2.7779) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:30:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][470/625] eta 0:01:21 lr 0.000138 wd 0.0500 time 0.5178 (0.5283) data time 0.0010 (0.0025) model time 0.5168 (0.5256) loss 7.7598 (7.0661) grad_norm 2.1038 (2.7817) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:30:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][480/625] eta 0:01:16 lr 0.000138 wd 0.0500 time 0.5186 (0.5283) data time 0.0009 (0.0025) model time 0.5178 (0.5256) loss 6.3030 (7.0655) grad_norm 3.0255 (2.7966) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:30:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][490/625] eta 0:01:11 lr 0.000138 wd 0.0500 time 0.8211 (0.5291) data time 0.0010 (0.0025) model time 0.8201 (0.5265) loss 6.2768 (7.0643) grad_norm 2.2993 (2.7918) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:30:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][500/625] eta 0:01:06 lr 0.000138 wd 0.0500 time 0.5164 (0.5290) data time 0.0011 (0.0025) model time 0.5153 (0.5265) loss 8.1118 (7.0675) grad_norm 4.3623 (2.7845) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:30:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][510/625] eta 0:01:00 lr 0.000138 wd 0.0500 time 0.5514 (0.5290) data time 0.0010 (0.0024) model time 0.5504 (0.5265) loss 7.7750 (7.0735) grad_norm 1.6965 (2.7725) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:30:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][520/625] eta 0:00:55 lr 0.000138 wd 0.0500 time 0.5162 (0.5289) data time 0.0007 (0.0024) model time 0.5155 (0.5263) loss 6.7404 (7.0835) grad_norm 2.4552 (2.7616) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:30:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][530/625] eta 0:00:50 lr 0.000138 wd 0.0500 time 0.5165 (0.5289) data time 0.0008 (0.0024) model time 0.5157 (0.5264) loss 6.6881 (7.0762) grad_norm 2.3906 (2.7561) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:30:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][540/625] eta 0:00:44 lr 0.000138 wd 0.0500 time 0.5176 (0.5288) data time 0.0009 (0.0024) model time 0.5167 (0.5263) loss 6.2081 (7.0761) grad_norm 1.8435 (2.7429) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:31:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][550/625] eta 0:00:39 lr 0.000138 wd 0.0500 time 0.5274 (0.5288) data time 0.0009 (0.0023) model time 0.5265 (0.5263) loss 6.3363 (7.0766) grad_norm 1.8401 (2.7491) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:31:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][560/625] eta 0:00:34 lr 0.000138 wd 0.0500 time 0.5376 (0.5289) data time 0.0007 (0.0023) model time 0.5369 (0.5265) loss 5.7276 (7.0738) grad_norm 2.7437 (2.7525) loss_scale 256.0000 (128.6845) mem 22339MB +[2024-07-27 23:31:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][570/625] eta 0:00:29 lr 0.000138 wd 0.0500 time 0.5168 (0.5288) data time 0.0008 (0.0023) model time 0.5159 (0.5264) loss 7.8526 (7.0781) grad_norm 1.9171 (2.7531) loss_scale 256.0000 (130.9142) mem 22339MB +[2024-07-27 23:31:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][580/625] eta 0:00:23 lr 0.000138 wd 0.0500 time 0.5173 (0.5288) data time 0.0008 (0.0023) model time 0.5164 (0.5264) loss 6.5079 (7.0762) grad_norm 5.9770 (2.7493) loss_scale 256.0000 (133.0671) mem 22339MB +[2024-07-27 23:31:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][590/625] eta 0:00:18 lr 0.000138 wd 0.0500 time 0.5154 (0.5288) data time 0.0006 (0.0023) model time 0.5148 (0.5264) loss 7.7484 (7.0733) grad_norm 2.3396 (2.7406) loss_scale 256.0000 (135.1472) mem 22339MB +[2024-07-27 23:31:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][600/625] eta 0:00:13 lr 0.000138 wd 0.0500 time 0.5116 (0.5287) data time 0.0009 (0.0023) model time 0.5107 (0.5263) loss 6.1486 (7.0806) grad_norm 2.1344 (2.7368) loss_scale 256.0000 (137.1581) mem 22339MB +[2024-07-27 23:31:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][610/625] eta 0:00:07 lr 0.000138 wd 0.0500 time 0.5285 (0.5287) data time 0.0005 (0.0022) model time 0.5280 (0.5263) loss 8.4351 (7.0883) grad_norm 1.9796 (2.7267) loss_scale 256.0000 (139.1031) mem 22339MB +[2024-07-27 23:31:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [240/300][620/625] eta 0:00:02 lr 0.000137 wd 0.0500 time 0.5137 (0.5287) data time 0.0007 (0.0022) model time 0.5130 (0.5264) loss 7.4797 (7.0916) grad_norm 1.8312 (2.7269) loss_scale 256.0000 (140.9855) mem 22339MB +[2024-07-27 23:31:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 240 training takes 0:05:30 +[2024-07-27 23:31:39 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 23:31:41 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 23:31:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.625 (0.625) Loss 0.4980 (0.4980) Acc@1 90.771 (90.771) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-27 23:31:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.164) Loss 0.7539 (0.6124) Acc@1 82.568 (87.846) Acc@5 97.070 (98.100) Mem 22339MB +[2024-07-27 23:31:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.141) Loss 0.8506 (0.6995) Acc@1 81.055 (85.263) Acc@5 96.289 (97.296) Mem 22339MB +[2024-07-27 23:31:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.853 Acc@5 97.275 +[2024-07-27 23:31:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.9% +[2024-07-27 23:31:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 1.025 (1.025) Loss 0.5039 (0.5039) Acc@1 90.332 (90.332) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-27 23:31:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.203) Loss 0.7437 (0.6130) Acc@1 83.105 (87.926) Acc@5 96.875 (98.091) Mem 22339MB +[2024-07-27 23:31:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.162) Loss 0.8447 (0.6991) Acc@1 79.883 (85.128) Acc@5 96.143 (97.284) Mem 22339MB +[2024-07-27 23:31:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.751 Acc@5 97.287 +[2024-07-27 23:31:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.8% +[2024-07-27 23:31:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.75% +[2024-07-27 23:31:48 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-27 23:31:50 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-27 23:31:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][0/625] eta 0:11:03 lr 0.000137 wd 0.0500 time 1.0622 (1.0622) data time 0.5432 (0.5432) model time 0.0000 (0.0000) loss 8.1164 (8.1164) grad_norm 2.4248 (2.4248) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:31:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][10/625] eta 0:05:52 lr 0.000137 wd 0.0500 time 0.5219 (0.5729) data time 0.0009 (0.0503) model time 0.0000 (0.0000) loss 7.7142 (7.0667) grad_norm 2.3183 (2.8372) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:32:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][20/625] eta 0:05:36 lr 0.000137 wd 0.0500 time 0.5428 (0.5556) data time 0.0009 (0.0268) model time 0.0000 (0.0000) loss 5.4479 (6.9406) grad_norm 2.0356 (2.7313) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:32:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][30/625] eta 0:05:23 lr 0.000137 wd 0.0500 time 0.5466 (0.5440) data time 0.0010 (0.0185) model time 0.0000 (0.0000) loss 7.3385 (6.8769) grad_norm 1.8621 (2.5975) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:32:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][40/625] eta 0:05:15 lr 0.000137 wd 0.0500 time 0.5194 (0.5399) data time 0.0009 (0.0142) model time 0.0000 (0.0000) loss 7.5990 (6.9869) grad_norm 1.9396 (2.4962) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:32:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][50/625] eta 0:05:08 lr 0.000137 wd 0.0500 time 0.5180 (0.5372) data time 0.0008 (0.0116) model time 0.0000 (0.0000) loss 6.0541 (6.9933) grad_norm 3.1456 (2.4317) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:32:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][60/625] eta 0:05:02 lr 0.000137 wd 0.0500 time 0.5365 (0.5354) data time 0.0007 (0.0099) model time 0.5358 (0.5248) loss 6.2806 (7.0023) grad_norm 3.2030 (2.4443) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:32:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][70/625] eta 0:04:56 lr 0.000137 wd 0.0500 time 0.5588 (0.5344) data time 0.0008 (0.0087) model time 0.5579 (0.5261) loss 7.7742 (7.0720) grad_norm 2.7860 (2.5857) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:32:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][80/625] eta 0:04:50 lr 0.000137 wd 0.0500 time 0.5442 (0.5334) data time 0.0011 (0.0077) model time 0.5432 (0.5257) loss 7.2781 (7.0894) grad_norm 4.2459 (2.6628) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:32:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][90/625] eta 0:04:47 lr 0.000137 wd 0.0500 time 0.5570 (0.5372) data time 0.0008 (0.0070) model time 0.5563 (0.5361) loss 7.7993 (7.1051) grad_norm 3.2678 (2.6301) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:32:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][100/625] eta 0:04:41 lr 0.000137 wd 0.0500 time 0.5259 (0.5357) data time 0.0007 (0.0065) model time 0.5251 (0.5329) loss 6.6620 (7.0832) grad_norm 10.1701 (2.7147) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:32:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][110/625] eta 0:04:35 lr 0.000137 wd 0.0500 time 0.5183 (0.5355) data time 0.0007 (0.0060) model time 0.5175 (0.5328) loss 8.2256 (7.1356) grad_norm 3.6213 (2.6946) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:32:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][120/625] eta 0:04:29 lr 0.000137 wd 0.0500 time 0.5386 (0.5345) data time 0.0009 (0.0055) model time 0.5377 (0.5313) loss 7.9820 (7.1384) grad_norm 2.5705 (2.6924) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:33:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][130/625] eta 0:04:24 lr 0.000137 wd 0.0500 time 0.5182 (0.5345) data time 0.0009 (0.0052) model time 0.5174 (0.5317) loss 6.7696 (7.1380) grad_norm 1.7915 (2.6944) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:33:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][140/625] eta 0:04:18 lr 0.000137 wd 0.0500 time 0.5180 (0.5334) data time 0.0011 (0.0049) model time 0.5169 (0.5302) loss 7.2208 (7.1256) grad_norm 2.1127 (2.6744) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:33:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][150/625] eta 0:04:13 lr 0.000136 wd 0.0500 time 0.5196 (0.5332) data time 0.0009 (0.0047) model time 0.5186 (0.5300) loss 8.2309 (7.1241) grad_norm 2.2468 (2.6504) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:33:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][160/625] eta 0:04:07 lr 0.000136 wd 0.0500 time 0.5217 (0.5328) data time 0.0008 (0.0044) model time 0.5209 (0.5296) loss 6.2757 (7.1216) grad_norm 2.3859 (2.6569) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:33:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][170/625] eta 0:04:02 lr 0.000136 wd 0.0500 time 0.5267 (0.5324) data time 0.0007 (0.0043) model time 0.5260 (0.5293) loss 7.1723 (7.1323) grad_norm 2.6086 (2.6758) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:33:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][180/625] eta 0:03:56 lr 0.000136 wd 0.0500 time 0.5504 (0.5320) data time 0.0010 (0.0041) model time 0.5494 (0.5289) loss 6.6820 (7.1300) grad_norm 2.6968 (2.7333) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:33:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][190/625] eta 0:03:51 lr 0.000136 wd 0.0500 time 0.5165 (0.5314) data time 0.0006 (0.0039) model time 0.5159 (0.5282) loss 5.9195 (7.1061) grad_norm 3.9482 (2.7113) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:33:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][200/625] eta 0:03:45 lr 0.000136 wd 0.0500 time 0.5177 (0.5310) data time 0.0008 (0.0038) model time 0.5169 (0.5278) loss 6.0112 (7.0829) grad_norm 3.1437 (2.6899) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:33:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][210/625] eta 0:03:40 lr 0.000136 wd 0.0500 time 0.5158 (0.5314) data time 0.0008 (0.0036) model time 0.5150 (0.5285) loss 7.6165 (7.0747) grad_norm 1.8579 (2.6849) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:33:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][220/625] eta 0:03:35 lr 0.000136 wd 0.0500 time 0.5240 (0.5312) data time 0.0010 (0.0035) model time 0.5231 (0.5284) loss 6.4437 (7.0902) grad_norm 2.6344 (2.7540) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:33:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][230/625] eta 0:03:29 lr 0.000136 wd 0.0500 time 0.5210 (0.5311) data time 0.0008 (0.0034) model time 0.5202 (0.5282) loss 7.0435 (7.1085) grad_norm 1.9319 (2.7498) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:33:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][240/625] eta 0:03:24 lr 0.000136 wd 0.0500 time 0.5612 (0.5310) data time 0.0009 (0.0033) model time 0.5603 (0.5283) loss 6.9868 (7.0921) grad_norm 2.7227 (2.7888) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:34:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][250/625] eta 0:03:19 lr 0.000136 wd 0.0500 time 0.5166 (0.5310) data time 0.0007 (0.0033) model time 0.5158 (0.5282) loss 6.7005 (7.0982) grad_norm 1.8242 (2.7730) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:34:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][260/625] eta 0:03:13 lr 0.000136 wd 0.0500 time 0.6189 (0.5312) data time 0.0016 (0.0032) model time 0.6173 (0.5286) loss 8.3937 (7.0860) grad_norm 3.6884 (2.7653) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:34:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][270/625] eta 0:03:08 lr 0.000136 wd 0.0500 time 0.5233 (0.5309) data time 0.0009 (0.0031) model time 0.5224 (0.5283) loss 6.3697 (7.0871) grad_norm 12.6651 (2.8083) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:34:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][280/625] eta 0:03:03 lr 0.000136 wd 0.0500 time 0.5878 (0.5311) data time 0.0009 (0.0030) model time 0.5869 (0.5286) loss 6.8764 (7.0852) grad_norm 2.7885 (2.8095) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:34:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][290/625] eta 0:02:57 lr 0.000136 wd 0.0500 time 0.5195 (0.5309) data time 0.0009 (0.0030) model time 0.5187 (0.5284) loss 6.9823 (7.0807) grad_norm 2.1065 (2.8485) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:34:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][300/625] eta 0:02:52 lr 0.000136 wd 0.0500 time 0.5191 (0.5309) data time 0.0009 (0.0029) model time 0.5182 (0.5285) loss 7.0696 (7.0895) grad_norm 3.2757 (2.8623) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:34:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][310/625] eta 0:02:47 lr 0.000135 wd 0.0500 time 0.5180 (0.5326) data time 0.0010 (0.0029) model time 0.5170 (0.5306) loss 7.7997 (7.0938) grad_norm 9.2794 (2.8803) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:34:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][320/625] eta 0:02:42 lr 0.000135 wd 0.0500 time 0.5262 (0.5327) data time 0.0008 (0.0028) model time 0.5253 (0.5307) loss 6.0226 (7.0881) grad_norm 2.9701 (2.8789) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-27 23:34:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-27 23:34:46 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 23:34:48 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 23:37:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 23:37:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 23:37:57 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 23:38:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 23:38:11 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 23:38:12 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 23:38:12 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 23:38:12 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 241) +[2024-07-27 23:38:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 23:38:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][330/625] eta 0:47:18 lr 0.000135 wd 0.0500 time 9.6214 (9.6214) data time 0.5517 (0.5517) model time 9.0698 (9.0698) loss 7.6156 (7.6156) grad_norm 2.6670 (2.6670) loss_scale 256.0000 (256.0000) mem 26017MB +[2024-07-27 23:38:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][340/625] eta 0:07:09 lr 0.000135 wd 0.0500 time 0.5867 (1.5084) data time 0.0008 (0.0509) model time 0.5858 (1.4575) loss 6.2044 (7.5772) grad_norm 2.4415 (2.5653) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 23:38:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][350/625] eta 0:04:54 lr 0.000135 wd 0.0500 time 0.5858 (1.0703) data time 0.0009 (0.0271) model time 0.5849 (1.0433) loss 6.4583 (7.3829) grad_norm 2.1045 (2.4220) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 23:38:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][360/625] eta 0:04:02 lr 0.000135 wd 0.0500 time 0.5837 (0.9144) data time 0.0007 (0.0186) model time 0.5831 (0.8957) loss 6.1062 (7.3257) grad_norm 2.3434 (2.3733) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 23:38:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][370/625] eta 0:03:32 lr 0.000135 wd 0.0500 time 0.5829 (0.8343) data time 0.0008 (0.0143) model time 0.5821 (0.8200) loss 6.5149 (7.2115) grad_norm 1.9570 (2.5065) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 23:38:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][380/625] eta 0:03:13 lr 0.000135 wd 0.0500 time 0.8090 (0.7901) data time 0.0006 (0.0117) model time 0.8084 (0.7784) loss 6.9817 (7.1980) grad_norm 1.8959 (2.4563) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 23:39:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][390/625] eta 0:02:58 lr 0.000135 wd 0.0500 time 0.5879 (0.7595) data time 0.0009 (0.0099) model time 0.5870 (0.7496) loss 6.2970 (7.1781) grad_norm 2.8347 (2.4431) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-27 23:39:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][400/625] eta 0:02:45 lr 0.000135 wd 0.0500 time 0.5879 (0.7355) data time 0.0009 (0.0086) model time 0.5870 (0.7269) loss 6.8875 (7.1418) grad_norm 2.0259 (inf) loss_scale 128.0000 (250.5915) mem 22344MB +[2024-07-27 23:39:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][410/625] eta 0:02:34 lr 0.000135 wd 0.0500 time 0.5900 (0.7175) data time 0.0009 (0.0077) model time 0.5892 (0.7098) loss 7.1763 (7.1260) grad_norm 1.9249 (inf) loss_scale 128.0000 (235.4568) mem 22344MB +[2024-07-27 23:39:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][420/625] eta 0:02:24 lr 0.000135 wd 0.0500 time 0.5875 (0.7034) data time 0.0006 (0.0069) model time 0.5869 (0.6965) loss 6.9878 (7.0956) grad_norm 6.1237 (inf) loss_scale 128.0000 (223.6484) mem 22344MB +[2024-07-27 23:39:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][430/625] eta 0:02:14 lr 0.000135 wd 0.0500 time 0.5873 (0.6920) data time 0.0007 (0.0063) model time 0.5866 (0.6857) loss 6.7057 (7.1236) grad_norm 2.3389 (inf) loss_scale 128.0000 (214.1782) mem 22344MB +[2024-07-27 23:39:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][440/625] eta 0:02:06 lr 0.000135 wd 0.0500 time 0.5872 (0.6827) data time 0.0008 (0.0058) model time 0.5863 (0.6768) loss 6.3638 (7.1274) grad_norm 2.0129 (inf) loss_scale 128.0000 (206.4144) mem 22344MB +[2024-07-27 23:39:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][450/625] eta 0:01:58 lr 0.000135 wd 0.0500 time 0.5893 (0.6751) data time 0.0006 (0.0054) model time 0.5887 (0.6696) loss 6.5599 (7.1407) grad_norm 1.8299 (inf) loss_scale 128.0000 (199.9339) mem 22344MB +[2024-07-27 23:39:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][460/625] eta 0:01:50 lr 0.000134 wd 0.0500 time 0.5855 (0.6686) data time 0.0009 (0.0051) model time 0.5847 (0.6635) loss 7.8943 (7.1300) grad_norm 1.7005 (inf) loss_scale 128.0000 (194.4427) mem 22344MB +[2024-07-27 23:39:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][470/625] eta 0:01:42 lr 0.000134 wd 0.0500 time 0.5918 (0.6631) data time 0.0007 (0.0048) model time 0.5912 (0.6583) loss 7.0853 (7.1252) grad_norm 2.2134 (inf) loss_scale 128.0000 (189.7305) mem 22344MB +[2024-07-27 23:39:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][480/625] eta 0:01:35 lr 0.000134 wd 0.0500 time 0.5920 (0.6583) data time 0.0008 (0.0045) model time 0.5912 (0.6538) loss 7.2016 (7.1133) grad_norm 2.5454 (inf) loss_scale 128.0000 (185.6424) mem 22344MB +[2024-07-27 23:40:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][490/625] eta 0:01:28 lr 0.000134 wd 0.0500 time 0.5913 (0.6542) data time 0.0009 (0.0043) model time 0.5904 (0.6499) loss 7.2255 (7.1191) grad_norm 1.7513 (inf) loss_scale 128.0000 (182.0621) mem 22344MB +[2024-07-27 23:40:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][500/625] eta 0:01:21 lr 0.000134 wd 0.0500 time 0.5915 (0.6505) data time 0.0008 (0.0041) model time 0.5907 (0.6464) loss 6.6349 (7.1314) grad_norm 1.9344 (inf) loss_scale 128.0000 (178.9006) mem 22344MB +[2024-07-27 23:40:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][510/625] eta 0:01:14 lr 0.000134 wd 0.0500 time 0.5902 (0.6471) data time 0.0009 (0.0039) model time 0.5893 (0.6432) loss 8.2188 (7.1148) grad_norm 2.0719 (inf) loss_scale 128.0000 (176.0884) mem 22344MB +[2024-07-27 23:40:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][520/625] eta 0:01:07 lr 0.000134 wd 0.0500 time 0.5904 (0.6441) data time 0.0009 (0.0037) model time 0.5895 (0.6404) loss 7.0013 (7.1154) grad_norm 2.3677 (inf) loss_scale 128.0000 (173.5707) mem 22344MB +[2024-07-27 23:40:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][530/625] eta 0:01:00 lr 0.000134 wd 0.0500 time 0.5892 (0.6414) data time 0.0009 (0.0036) model time 0.5883 (0.6378) loss 6.5830 (7.0895) grad_norm 2.7874 (inf) loss_scale 128.0000 (171.3035) mem 22344MB +[2024-07-27 23:40:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][540/625] eta 0:00:54 lr 0.000134 wd 0.0500 time 0.5896 (0.6391) data time 0.0008 (0.0035) model time 0.5887 (0.6356) loss 8.7999 (7.0810) grad_norm 3.1153 (inf) loss_scale 128.0000 (169.2512) mem 22344MB +[2024-07-27 23:40:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][550/625] eta 0:00:47 lr 0.000134 wd 0.0500 time 0.5909 (0.6369) data time 0.0007 (0.0034) model time 0.5901 (0.6335) loss 6.7331 (7.0656) grad_norm 2.6935 (inf) loss_scale 128.0000 (167.3846) mem 22344MB +[2024-07-27 23:40:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][560/625] eta 0:00:41 lr 0.000134 wd 0.0500 time 0.5945 (0.6350) data time 0.0007 (0.0033) model time 0.5938 (0.6318) loss 6.0151 (7.0752) grad_norm 1.9507 (inf) loss_scale 128.0000 (165.6797) mem 22344MB +[2024-07-27 23:40:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][570/625] eta 0:00:34 lr 0.000134 wd 0.0500 time 0.5852 (0.6332) data time 0.0007 (0.0032) model time 0.5845 (0.6301) loss 6.4531 (7.0630) grad_norm 1.9593 (inf) loss_scale 128.0000 (164.1162) mem 22344MB +[2024-07-27 23:40:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-27 23:40:53 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 23:40:58 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 23:46:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 23:46:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 23:46:59 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 23:47:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 23:47:17 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 23:47:17 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 23:47:17 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 23:47:17 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 241) +[2024-07-27 23:47:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 23:47:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][580/625] eta 0:04:05 lr 0.000134 wd 0.0500 time 1.6057 (5.4601) data time 0.0008 (0.3275) model time 1.6049 (5.1325) loss 7.8190 (8.0996) grad_norm 3.1672 (2.6280) loss_scale 128.0000 (128.0000) mem 22342MB +[2024-07-27 23:47:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][590/625] eta 0:00:48 lr 0.000134 wd 0.0500 time 0.5757 (1.3915) data time 0.0008 (0.0553) model time 0.5749 (1.3361) loss 6.1797 (7.2013) grad_norm 2.2443 (2.4321) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:47:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][600/625] eta 0:00:25 lr 0.000134 wd 0.0500 time 0.5778 (1.0221) data time 0.0008 (0.0306) model time 0.5769 (0.9915) loss 7.3836 (7.2172) grad_norm 2.9181 (3.3776) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:47:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][610/625] eta 0:00:13 lr 0.000133 wd 0.0500 time 0.5765 (0.8824) data time 0.0004 (0.0214) model time 0.5761 (0.8610) loss 7.1972 (7.2981) grad_norm 1.9414 (3.0851) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:47:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [241/300][620/625] eta 0:00:04 lr 0.000133 wd 0.0500 time 0.5749 (0.8094) data time 0.0006 (0.0164) model time 0.5743 (0.7929) loss 7.9469 (7.2607) grad_norm 3.5825 (3.0191) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-27 23:47:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 241 training takes 0:00:36 +[2024-07-27 23:47:58 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 23:48:01 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 23:48:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.456 (0.456) Loss 0.4990 (0.4990) Acc@1 90.234 (90.234) Acc@5 98.926 (98.926) Mem 22341MB +[2024-07-27 23:48:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.155) Loss 0.7559 (0.6120) Acc@1 82.861 (87.806) Acc@5 96.973 (98.109) Mem 22341MB +[2024-07-27 23:48:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.124 (0.141) Loss 0.8525 (0.7007) Acc@1 80.225 (85.182) Acc@5 96.143 (97.282) Mem 22341MB +[2024-07-27 23:48:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.827 Acc@5 97.273 +[2024-07-27 23:48:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.8% +[2024-07-27 23:48:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 1.227 (1.227) Loss 0.5034 (0.5034) Acc@1 90.332 (90.332) Acc@5 98.975 (98.975) Mem 22341MB +[2024-07-27 23:48:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.124 (0.225) Loss 0.7427 (0.6127) Acc@1 83.105 (87.939) Acc@5 96.924 (98.109) Mem 22341MB +[2024-07-27 23:48:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.124 (0.177) Loss 0.8442 (0.6987) Acc@1 80.078 (85.159) Acc@5 96.094 (97.294) Mem 22341MB +[2024-07-27 23:48:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.781 Acc@5 97.297 +[2024-07-27 23:48:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.8% +[2024-07-27 23:48:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.78% +[2024-07-27 23:48:11 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-27 23:48:15 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-27 23:48:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][0/625] eta 0:11:37 lr 0.000133 wd 0.0500 time 1.1153 (1.1153) data time 0.3822 (0.3822) model time 0.0000 (0.0000) loss 7.6345 (7.6345) grad_norm 2.0634 (2.0634) loss_scale 128.0000 (128.0000) mem 22337MB +[2024-07-27 23:48:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][10/625] eta 0:06:39 lr 0.000133 wd 0.0500 time 0.5788 (0.6495) data time 0.0009 (0.0355) model time 0.0000 (0.0000) loss 7.9489 (7.0456) grad_norm 2.5505 (3.3652) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-27 23:48:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-27 23:48:23 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 23:48:26 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-27 23:51:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-27 23:51:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-27 23:51:59 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-27 23:52:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-27 23:52:08 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-27 23:52:08 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-27 23:52:08 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-27 23:52:08 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 242) +[2024-07-27 23:52:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-27 23:52:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][20/625] eta 0:16:25 lr 0.000133 wd 0.0500 time 0.5699 (1.6296) data time 0.0011 (0.0738) model time 0.0000 (0.0000) loss 8.0241 (7.5386) grad_norm 2.4194 (2.3036) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-27 23:52:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][30/625] eta 0:10:55 lr 0.000133 wd 0.0500 time 0.5755 (1.1017) data time 0.0007 (0.0374) model time 0.0000 (0.0000) loss 7.4949 (7.2600) grad_norm 2.1914 (2.2573) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-27 23:52:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][40/625] eta 0:09:01 lr 0.000133 wd 0.0500 time 0.5733 (0.9255) data time 0.0010 (0.0252) model time 0.0000 (0.0000) loss 7.9731 (7.4153) grad_norm 1.7802 (2.3531) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-27 23:52:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][50/625] eta 0:08:01 lr 0.000133 wd 0.0500 time 0.5740 (0.8375) data time 0.0007 (0.0192) model time 0.0000 (0.0000) loss 6.6545 (7.3424) grad_norm 2.9394 (2.3863) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-27 23:52:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][60/625] eta 0:07:23 lr 0.000133 wd 0.0500 time 0.5744 (0.7849) data time 0.0009 (0.0156) model time 0.5735 (0.5737) loss 6.6811 (7.2850) grad_norm 1.6486 (2.3927) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-27 23:52:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][70/625] eta 0:06:59 lr 0.000133 wd 0.0500 time 0.5786 (0.7561) data time 0.0007 (0.0131) model time 0.5779 (0.5923) loss 6.9198 (7.2220) grad_norm 2.6913 (2.4348) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-27 23:53:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][80/625] eta 0:06:38 lr 0.000133 wd 0.0500 time 0.5761 (0.7305) data time 0.0007 (0.0114) model time 0.5755 (0.5870) loss 5.9551 (7.2133) grad_norm 1.9302 (2.4292) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-27 23:53:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][90/625] eta 0:06:20 lr 0.000133 wd 0.0500 time 0.5734 (0.7110) data time 0.0009 (0.0101) model time 0.5725 (0.5837) loss 8.3486 (7.2073) grad_norm 4.5038 (2.5655) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-27 23:53:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][100/625] eta 0:06:05 lr 0.000133 wd 0.0500 time 0.5760 (0.6960) data time 0.0007 (0.0091) model time 0.5752 (0.5819) loss 7.2416 (7.1674) grad_norm 2.6730 (2.6382) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-27 23:53:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][110/625] eta 0:05:52 lr 0.000133 wd 0.0500 time 0.5749 (0.6840) data time 0.0010 (0.0082) model time 0.5738 (0.5808) loss 7.8647 (7.2036) grad_norm 1.7597 (2.6591) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-27 23:53:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][120/625] eta 0:05:40 lr 0.000133 wd 0.0500 time 0.5778 (0.6742) data time 0.0008 (0.0076) model time 0.5769 (0.5800) loss 7.1899 (7.2459) grad_norm 2.7513 (2.6567) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-27 23:53:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][130/625] eta 0:05:29 lr 0.000133 wd 0.0500 time 0.5738 (0.6660) data time 0.0007 (0.0070) model time 0.5732 (0.5793) loss 7.9279 (7.2394) grad_norm 2.0589 (2.6840) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-27 23:53:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][140/625] eta 0:05:19 lr 0.000132 wd 0.0500 time 0.5779 (0.6591) data time 0.0007 (0.0065) model time 0.5772 (0.5789) loss 7.1050 (7.2207) grad_norm 2.0127 (2.6644) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-27 23:53:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][150/625] eta 0:05:10 lr 0.000132 wd 0.0500 time 0.5782 (0.6533) data time 0.0007 (0.0061) model time 0.5775 (0.5787) loss 6.9496 (7.2095) grad_norm 2.3871 (2.6547) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-27 23:53:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-27 23:53:48 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-27 23:53:53 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 00:08:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 00:08:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 00:08:54 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-28 00:09:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-28 00:09:19 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-28 00:09:19 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-28 00:09:19 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-28 00:09:20 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 242) +[2024-07-28 00:09:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-28 00:09:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][160/625] eta 0:51:08 lr 0.000132 wd 0.0500 time 2.0351 (6.5988) data time 0.0010 (0.3785) model time 2.0340 (6.2203) loss 6.8963 (6.9082) grad_norm 2.7293 (3.8640) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:09:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][170/625] eta 0:12:11 lr 0.000132 wd 0.0500 time 0.6128 (1.6083) data time 0.0008 (0.0640) model time 0.6120 (1.5443) loss 6.5118 (7.1004) grad_norm 3.6710 (3.2183) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:09:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][180/625] eta 0:08:34 lr 0.000132 wd 0.0500 time 0.6029 (1.1552) data time 0.0011 (0.0355) model time 0.6018 (1.1197) loss 8.0103 (7.2122) grad_norm 2.3773 (2.8936) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:09:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][190/625] eta 0:07:07 lr 0.000132 wd 0.0500 time 0.6067 (0.9836) data time 0.0009 (0.0248) model time 0.6058 (0.9588) loss 7.4817 (7.2623) grad_norm 1.6945 (2.7539) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:10:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][200/625] eta 0:06:19 lr 0.000132 wd 0.0500 time 0.6065 (0.8937) data time 0.0010 (0.0192) model time 0.6055 (0.8745) loss 7.4162 (7.2316) grad_norm 2.5066 (3.1083) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:10:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][210/625] eta 0:05:49 lr 0.000132 wd 0.0500 time 0.5377 (0.8417) data time 0.0009 (0.0157) model time 0.5368 (0.8260) loss 5.9067 (7.2095) grad_norm 2.1359 (3.0889) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:10:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][220/625] eta 0:05:27 lr 0.000132 wd 0.0500 time 0.6143 (0.8081) data time 0.0007 (0.0133) model time 0.6136 (0.7948) loss 7.6181 (7.1747) grad_norm 3.1480 (3.0522) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:10:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][230/625] eta 0:05:08 lr 0.000132 wd 0.0500 time 0.6212 (0.7811) data time 0.0010 (0.0116) model time 0.6202 (0.7695) loss 7.9720 (7.1512) grad_norm 2.8213 (2.9849) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:10:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][240/625] eta 0:04:52 lr 0.000132 wd 0.0500 time 0.6186 (0.7609) data time 0.0010 (0.0103) model time 0.6176 (0.7505) loss 6.7827 (7.1456) grad_norm 2.2975 (2.9207) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:10:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][250/625] eta 0:04:39 lr 0.000132 wd 0.0500 time 0.6174 (0.7449) data time 0.0008 (0.0093) model time 0.6166 (0.7356) loss 6.1255 (7.1175) grad_norm 1.6989 (2.8051) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:10:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][260/625] eta 0:04:27 lr 0.000132 wd 0.0500 time 0.6082 (0.7316) data time 0.0008 (0.0085) model time 0.6074 (0.7231) loss 7.3524 (7.1385) grad_norm 1.7441 (2.7557) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:10:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][270/625] eta 0:04:15 lr 0.000132 wd 0.0500 time 0.6130 (0.7206) data time 0.0011 (0.0078) model time 0.6120 (0.7128) loss 7.3980 (7.1376) grad_norm 3.4795 (2.7295) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:10:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][280/625] eta 0:04:05 lr 0.000132 wd 0.0500 time 0.6095 (0.7114) data time 0.0008 (0.0073) model time 0.6088 (0.7041) loss 7.5794 (7.1354) grad_norm 1.6067 (2.7387) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:10:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][290/625] eta 0:03:55 lr 0.000132 wd 0.0500 time 0.6125 (0.7035) data time 0.0011 (0.0068) model time 0.6114 (0.6967) loss 7.1694 (7.1167) grad_norm 2.5782 (2.7431) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:11:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][300/625] eta 0:03:46 lr 0.000131 wd 0.0500 time 0.6211 (0.6973) data time 0.0009 (0.0064) model time 0.6201 (0.6909) loss 7.3396 (7.0967) grad_norm 2.7926 (2.8561) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:11:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][310/625] eta 0:03:37 lr 0.000131 wd 0.0500 time 0.6167 (0.6919) data time 0.0011 (0.0061) model time 0.6156 (0.6858) loss 8.4651 (7.1054) grad_norm 1.7952 (2.8169) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:11:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][320/625] eta 0:03:29 lr 0.000131 wd 0.0500 time 0.6166 (0.6873) data time 0.0010 (0.0058) model time 0.6156 (0.6815) loss 7.4326 (7.1091) grad_norm 2.7109 (2.7850) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:11:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][330/625] eta 0:03:21 lr 0.000131 wd 0.0500 time 0.6109 (0.6829) data time 0.0010 (0.0055) model time 0.6099 (0.6774) loss 6.5659 (7.1046) grad_norm 1.9022 (2.7456) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:11:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][340/625] eta 0:03:13 lr 0.000131 wd 0.0500 time 0.6083 (0.6788) data time 0.0010 (0.0052) model time 0.6074 (0.6736) loss 8.1112 (7.0978) grad_norm 2.4886 (2.7469) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:11:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][350/625] eta 0:03:05 lr 0.000131 wd 0.0500 time 0.6082 (0.6751) data time 0.0012 (0.0050) model time 0.6070 (0.6701) loss 7.2503 (7.0940) grad_norm 3.4913 (2.8042) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:11:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][360/625] eta 0:02:58 lr 0.000131 wd 0.0500 time 0.6092 (0.6718) data time 0.0008 (0.0048) model time 0.6084 (0.6670) loss 7.0326 (7.0776) grad_norm 2.2919 (2.7739) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:11:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][370/625] eta 0:02:50 lr 0.000131 wd 0.0500 time 0.6177 (0.6692) data time 0.0010 (0.0047) model time 0.6167 (0.6645) loss 7.4490 (7.0716) grad_norm 3.1450 (2.7672) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:11:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][380/625] eta 0:02:43 lr 0.000131 wd 0.0500 time 0.6138 (0.6667) data time 0.0009 (0.0045) model time 0.6129 (0.6622) loss 7.2456 (7.0551) grad_norm 2.3658 (2.7753) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:11:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][390/625] eta 0:02:36 lr 0.000131 wd 0.0500 time 0.6205 (0.6645) data time 0.0009 (0.0044) model time 0.6196 (0.6601) loss 6.0733 (7.0588) grad_norm 3.7420 (2.7973) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:12:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][400/625] eta 0:02:29 lr 0.000131 wd 0.0500 time 0.6047 (0.6624) data time 0.0009 (0.0042) model time 0.6038 (0.6582) loss 7.4438 (7.0593) grad_norm 1.8177 (2.8316) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:12:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][410/625] eta 0:02:21 lr 0.000131 wd 0.0500 time 0.6173 (0.6603) data time 0.0008 (0.0041) model time 0.6165 (0.6562) loss 7.4233 (7.0440) grad_norm 1.8990 (2.8120) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:12:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][420/625] eta 0:02:14 lr 0.000131 wd 0.0500 time 0.6117 (0.6583) data time 0.0010 (0.0040) model time 0.6107 (0.6543) loss 7.4766 (7.0382) grad_norm 1.9811 (2.7964) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:12:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][430/625] eta 0:02:08 lr 0.000131 wd 0.0500 time 0.7286 (0.6570) data time 0.0012 (0.0039) model time 0.7275 (0.6531) loss 6.9078 (7.0313) grad_norm 2.0665 (2.7745) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:12:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][440/625] eta 0:02:01 lr 0.000131 wd 0.0500 time 0.6181 (0.6562) data time 0.0010 (0.0038) model time 0.6171 (0.6524) loss 6.0068 (7.0349) grad_norm 1.6708 (2.7540) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:12:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][450/625] eta 0:01:54 lr 0.000131 wd 0.0500 time 0.6156 (0.6547) data time 0.0010 (0.0037) model time 0.6146 (0.6511) loss 7.1109 (7.0345) grad_norm 2.0617 (2.7765) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:12:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][460/625] eta 0:01:47 lr 0.000130 wd 0.0500 time 0.6123 (0.6534) data time 0.0011 (0.0036) model time 0.6113 (0.6498) loss 7.3372 (7.0268) grad_norm 2.0293 (2.7545) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:12:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][470/625] eta 0:01:41 lr 0.000130 wd 0.0500 time 0.6155 (0.6522) data time 0.0010 (0.0035) model time 0.6145 (0.6487) loss 7.7127 (7.0309) grad_norm 1.9812 (2.7488) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:12:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][480/625] eta 0:01:34 lr 0.000130 wd 0.0500 time 0.6125 (0.6509) data time 0.0010 (0.0034) model time 0.6115 (0.6475) loss 7.4672 (7.0476) grad_norm 1.7743 (2.7310) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:12:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][490/625] eta 0:01:27 lr 0.000130 wd 0.0500 time 0.6099 (0.6497) data time 0.0010 (0.0034) model time 0.6089 (0.6463) loss 7.8504 (7.0460) grad_norm 3.2314 (2.7430) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:13:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][500/625] eta 0:01:21 lr 0.000130 wd 0.0500 time 0.6103 (0.6485) data time 0.0008 (0.0033) model time 0.6095 (0.6452) loss 7.8151 (7.0565) grad_norm 1.7438 (2.7380) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:13:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][510/625] eta 0:01:14 lr 0.000130 wd 0.0500 time 0.6139 (0.6474) data time 0.0008 (0.0032) model time 0.6131 (0.6442) loss 7.7079 (7.0617) grad_norm 1.9843 (2.7306) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:13:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][520/625] eta 0:01:07 lr 0.000130 wd 0.0500 time 0.6167 (0.6465) data time 0.0008 (0.0032) model time 0.6160 (0.6434) loss 7.7250 (7.0646) grad_norm 3.5956 (2.7262) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:13:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][530/625] eta 0:01:01 lr 0.000130 wd 0.0500 time 0.6172 (0.6457) data time 0.0010 (0.0031) model time 0.6162 (0.6426) loss 7.1412 (7.0558) grad_norm 5.2373 (2.7397) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:13:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][540/625] eta 0:00:54 lr 0.000130 wd 0.0500 time 0.6143 (0.6449) data time 0.0009 (0.0030) model time 0.6135 (0.6419) loss 7.7669 (7.0537) grad_norm 2.1690 (2.7316) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:13:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][550/625] eta 0:00:48 lr 0.000130 wd 0.0500 time 0.6076 (0.6441) data time 0.0009 (0.0030) model time 0.6067 (0.6411) loss 6.1413 (7.0462) grad_norm 2.5740 (2.7451) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:13:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][560/625] eta 0:00:41 lr 0.000130 wd 0.0500 time 0.6094 (0.6432) data time 0.0008 (0.0029) model time 0.6086 (0.6402) loss 6.8142 (7.0447) grad_norm 3.0474 (2.7464) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:13:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][570/625] eta 0:00:35 lr 0.000130 wd 0.0500 time 0.6093 (0.6423) data time 0.0010 (0.0029) model time 0.6083 (0.6394) loss 6.6744 (7.0444) grad_norm 2.2030 (2.7440) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:13:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][580/625] eta 0:00:28 lr 0.000130 wd 0.0500 time 0.6111 (0.6415) data time 0.0008 (0.0029) model time 0.6103 (0.6387) loss 7.0195 (7.0423) grad_norm 1.8518 (2.7419) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:14:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][590/625] eta 0:00:22 lr 0.000130 wd 0.0500 time 0.6134 (0.6409) data time 0.0007 (0.0028) model time 0.6126 (0.6381) loss 6.2440 (7.0465) grad_norm 2.7946 (2.7348) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:14:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][600/625] eta 0:00:16 lr 0.000130 wd 0.0500 time 0.6142 (0.6403) data time 0.0008 (0.0028) model time 0.6134 (0.6375) loss 6.8243 (7.0485) grad_norm 2.9697 (2.7294) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:14:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][610/625] eta 0:00:09 lr 0.000129 wd 0.0500 time 0.6116 (0.6397) data time 0.0007 (0.0027) model time 0.6109 (0.6370) loss 6.7496 (7.0435) grad_norm 3.0989 (2.7255) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:14:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [242/300][620/625] eta 0:00:03 lr 0.000129 wd 0.0500 time 0.6082 (0.6391) data time 0.0005 (0.0027) model time 0.6077 (0.6364) loss 5.7221 (7.0342) grad_norm 1.9459 (2.7126) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 00:14:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 242 training takes 0:04:57 +[2024-07-28 00:14:21 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 00:14:27 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 00:14:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.528 (0.528) Loss 0.4958 (0.4958) Acc@1 90.527 (90.527) Acc@5 99.023 (99.023) Mem 22344MB +[2024-07-28 00:14:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.163) Loss 0.7510 (0.6083) Acc@1 83.105 (87.944) Acc@5 96.924 (98.100) Mem 22344MB +[2024-07-28 00:14:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.145) Loss 0.8545 (0.6966) Acc@1 79.639 (85.238) Acc@5 96.143 (97.301) Mem 22344MB +[2024-07-28 00:14:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.873 Acc@5 97.309 +[2024-07-28 00:14:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.9% +[2024-07-28 00:14:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.855 (0.855) Loss 0.5039 (0.5039) Acc@1 90.332 (90.332) Acc@5 98.975 (98.975) Mem 22344MB +[2024-07-28 00:14:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.193) Loss 0.7437 (0.6129) Acc@1 83.203 (87.953) Acc@5 96.973 (98.100) Mem 22344MB +[2024-07-28 00:14:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.161) Loss 0.8442 (0.6988) Acc@1 80.078 (85.177) Acc@5 96.143 (97.294) Mem 22344MB +[2024-07-28 00:14:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.807 Acc@5 97.295 +[2024-07-28 00:14:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.8% +[2024-07-28 00:14:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.81% +[2024-07-28 00:14:36 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-28 00:14:41 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-28 00:14:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][0/625] eta 0:12:19 lr 0.000129 wd 0.0500 time 1.1835 (1.1835) data time 0.4641 (0.4641) model time 0.0000 (0.0000) loss 6.3494 (6.3494) grad_norm 1.9448 (1.9448) loss_scale 128.0000 (128.0000) mem 22337MB +[2024-07-28 00:14:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][10/625] eta 0:06:47 lr 0.000129 wd 0.0500 time 0.6140 (0.6632) data time 0.0008 (0.0431) model time 0.0000 (0.0000) loss 7.2923 (6.5552) grad_norm 3.5299 (2.5782) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:14:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][20/625] eta 0:06:26 lr 0.000129 wd 0.0500 time 0.6121 (0.6387) data time 0.0010 (0.0231) model time 0.0000 (0.0000) loss 6.8727 (6.9333) grad_norm 2.4321 (2.4098) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:15:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][30/625] eta 0:06:17 lr 0.000129 wd 0.0500 time 0.6091 (0.6345) data time 0.0008 (0.0160) model time 0.0000 (0.0000) loss 6.6430 (6.9779) grad_norm 1.6772 (2.4903) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:15:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][40/625] eta 0:06:07 lr 0.000129 wd 0.0500 time 0.6059 (0.6276) data time 0.0009 (0.0123) model time 0.0000 (0.0000) loss 7.8226 (6.9909) grad_norm 1.9610 (2.4403) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:15:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][50/625] eta 0:05:58 lr 0.000129 wd 0.0500 time 0.6085 (0.6236) data time 0.0009 (0.0101) model time 0.0000 (0.0000) loss 6.2752 (7.0349) grad_norm 2.6786 (2.4790) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:15:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][60/625] eta 0:05:50 lr 0.000129 wd 0.0500 time 0.6129 (0.6212) data time 0.0010 (0.0086) model time 0.6119 (0.6077) loss 7.0284 (7.0240) grad_norm 2.0026 (2.4321) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:15:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][70/625] eta 0:05:43 lr 0.000129 wd 0.0500 time 0.6153 (0.6198) data time 0.0008 (0.0075) model time 0.6146 (0.6090) loss 6.2908 (6.9748) grad_norm 1.7209 (2.3836) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:15:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][80/625] eta 0:05:37 lr 0.000129 wd 0.0500 time 0.6136 (0.6188) data time 0.0007 (0.0067) model time 0.6128 (0.6096) loss 7.3628 (6.9682) grad_norm 2.2009 (2.5889) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:15:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][90/625] eta 0:05:30 lr 0.000129 wd 0.0500 time 0.6116 (0.6181) data time 0.0007 (0.0061) model time 0.6109 (0.6100) loss 5.4820 (6.9941) grad_norm 2.5480 (2.7786) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:15:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][100/625] eta 0:05:23 lr 0.000129 wd 0.0500 time 0.6062 (0.6171) data time 0.0010 (0.0056) model time 0.6052 (0.6094) loss 7.4754 (7.0322) grad_norm 2.2838 (2.8184) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:15:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][110/625] eta 0:05:17 lr 0.000129 wd 0.0500 time 0.6081 (0.6164) data time 0.0009 (0.0052) model time 0.6072 (0.6091) loss 6.7553 (7.0321) grad_norm 2.2785 (2.7730) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:15:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][120/625] eta 0:05:10 lr 0.000129 wd 0.0500 time 0.6123 (0.6158) data time 0.0008 (0.0049) model time 0.6115 (0.6090) loss 8.0793 (7.0640) grad_norm 2.4289 (2.7667) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:16:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][130/625] eta 0:05:04 lr 0.000129 wd 0.0500 time 0.6123 (0.6152) data time 0.0009 (0.0046) model time 0.6115 (0.6088) loss 7.5092 (7.0676) grad_norm 2.5913 (2.7582) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:16:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][140/625] eta 0:04:58 lr 0.000129 wd 0.0500 time 0.6087 (0.6148) data time 0.0011 (0.0043) model time 0.6076 (0.6087) loss 7.6509 (7.0642) grad_norm 2.4194 (2.7070) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:16:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][150/625] eta 0:04:51 lr 0.000128 wd 0.0500 time 0.6235 (0.6146) data time 0.0010 (0.0041) model time 0.6225 (0.6090) loss 7.7782 (7.0645) grad_norm 1.6834 (2.6679) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:16:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][160/625] eta 0:04:45 lr 0.000128 wd 0.0500 time 0.6141 (0.6144) data time 0.0008 (0.0039) model time 0.6134 (0.6091) loss 6.6205 (7.0629) grad_norm 1.9996 (2.6173) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:16:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][170/625] eta 0:04:39 lr 0.000128 wd 0.0500 time 0.6088 (0.6143) data time 0.0009 (0.0038) model time 0.6080 (0.6092) loss 6.2692 (7.0669) grad_norm 9.7513 (2.6373) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:16:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][180/625] eta 0:04:33 lr 0.000128 wd 0.0500 time 0.6101 (0.6141) data time 0.0008 (0.0036) model time 0.6093 (0.6093) loss 6.7131 (7.0449) grad_norm 2.6625 (2.6343) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:16:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][190/625] eta 0:04:27 lr 0.000128 wd 0.0500 time 0.6107 (0.6146) data time 0.0010 (0.0035) model time 0.6097 (0.6103) loss 5.7576 (7.0244) grad_norm 2.3534 (2.6619) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:16:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][200/625] eta 0:04:21 lr 0.000128 wd 0.0500 time 0.6140 (0.6144) data time 0.0009 (0.0033) model time 0.6130 (0.6103) loss 7.4306 (7.0283) grad_norm 2.3959 (2.6688) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:16:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][210/625] eta 0:04:14 lr 0.000128 wd 0.0500 time 0.6093 (0.6143) data time 0.0007 (0.0032) model time 0.6086 (0.6102) loss 6.6486 (7.0381) grad_norm 1.6728 (2.6518) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:16:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][220/625] eta 0:04:08 lr 0.000128 wd 0.0500 time 0.6175 (0.6144) data time 0.0008 (0.0031) model time 0.6168 (0.6106) loss 7.1954 (7.0438) grad_norm 2.2773 (2.6540) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:17:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][230/625] eta 0:04:02 lr 0.000128 wd 0.0500 time 0.6140 (0.6143) data time 0.0010 (0.0030) model time 0.6130 (0.6106) loss 6.0505 (7.0213) grad_norm 1.8837 (2.6443) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:17:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][240/625] eta 0:03:56 lr 0.000128 wd 0.0500 time 0.6173 (0.6144) data time 0.0007 (0.0030) model time 0.6166 (0.6109) loss 6.0333 (7.0148) grad_norm 2.2539 (2.6340) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:17:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][250/625] eta 0:03:50 lr 0.000128 wd 0.0500 time 0.6083 (0.6149) data time 0.0010 (0.0029) model time 0.6073 (0.6116) loss 5.4949 (6.9961) grad_norm 2.5193 (2.6172) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:17:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][260/625] eta 0:03:44 lr 0.000128 wd 0.0500 time 0.6069 (0.6146) data time 0.0008 (0.0028) model time 0.6060 (0.6114) loss 5.7627 (6.9880) grad_norm 3.3455 (2.6149) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:17:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][270/625] eta 0:03:38 lr 0.000128 wd 0.0500 time 0.6090 (0.6148) data time 0.0007 (0.0027) model time 0.6083 (0.6117) loss 7.7998 (7.0095) grad_norm 3.4817 (2.6544) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:17:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][280/625] eta 0:03:32 lr 0.000128 wd 0.0500 time 0.6174 (0.6147) data time 0.0007 (0.0027) model time 0.6167 (0.6117) loss 7.2799 (7.0185) grad_norm 2.1743 (2.6455) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:17:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][290/625] eta 0:03:25 lr 0.000128 wd 0.0500 time 0.6239 (0.6147) data time 0.0009 (0.0026) model time 0.6230 (0.6118) loss 7.2611 (7.0121) grad_norm 2.4265 (2.6343) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:17:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][300/625] eta 0:03:19 lr 0.000127 wd 0.0500 time 0.6196 (0.6146) data time 0.0010 (0.0026) model time 0.6186 (0.6118) loss 6.0588 (7.0226) grad_norm 1.6950 (2.6307) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:17:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][310/625] eta 0:03:13 lr 0.000127 wd 0.0500 time 0.6118 (0.6146) data time 0.0010 (0.0025) model time 0.6108 (0.6118) loss 6.1818 (7.0305) grad_norm 1.7277 (2.6249) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:17:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][320/625] eta 0:03:07 lr 0.000127 wd 0.0500 time 0.6093 (0.6145) data time 0.0008 (0.0025) model time 0.6085 (0.6118) loss 7.2626 (7.0277) grad_norm 2.9356 (2.6153) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:18:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][330/625] eta 0:03:01 lr 0.000127 wd 0.0500 time 0.6118 (0.6144) data time 0.0008 (0.0024) model time 0.6110 (0.6117) loss 7.7364 (7.0351) grad_norm 1.9004 (2.6092) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:18:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][340/625] eta 0:02:55 lr 0.000127 wd 0.0500 time 0.6099 (0.6142) data time 0.0008 (0.0024) model time 0.6092 (0.6115) loss 6.5900 (7.0222) grad_norm 2.4097 (2.5952) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:18:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][350/625] eta 0:02:48 lr 0.000127 wd 0.0500 time 0.6129 (0.6142) data time 0.0010 (0.0024) model time 0.6119 (0.6116) loss 8.3685 (7.0271) grad_norm 1.7187 (2.5856) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:18:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][360/625] eta 0:02:42 lr 0.000127 wd 0.0500 time 0.6092 (0.6142) data time 0.0008 (0.0023) model time 0.6084 (0.6116) loss 5.9132 (7.0232) grad_norm 1.6938 (2.5823) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:18:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][370/625] eta 0:02:36 lr 0.000127 wd 0.0500 time 0.6165 (0.6141) data time 0.0010 (0.0023) model time 0.6156 (0.6116) loss 6.0499 (7.0179) grad_norm 2.0202 (2.5715) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:18:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][380/625] eta 0:02:30 lr 0.000127 wd 0.0500 time 0.6128 (0.6140) data time 0.0010 (0.0023) model time 0.6118 (0.6115) loss 7.3072 (7.0108) grad_norm 2.1830 (2.5649) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:18:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][390/625] eta 0:02:24 lr 0.000127 wd 0.0500 time 0.6090 (0.6140) data time 0.0010 (0.0022) model time 0.6080 (0.6115) loss 7.3763 (7.0166) grad_norm 2.2991 (2.5555) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:18:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][400/625] eta 0:02:18 lr 0.000127 wd 0.0500 time 0.6168 (0.6139) data time 0.0007 (0.0022) model time 0.6161 (0.6114) loss 6.4046 (7.0130) grad_norm 1.9565 (2.5509) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:18:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][410/625] eta 0:02:12 lr 0.000127 wd 0.0500 time 0.7347 (0.6141) data time 0.0008 (0.0022) model time 0.7339 (0.6117) loss 6.2619 (7.0133) grad_norm 2.8342 (2.5533) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:18:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][420/625] eta 0:02:05 lr 0.000127 wd 0.0500 time 0.6106 (0.6139) data time 0.0008 (0.0021) model time 0.6099 (0.6116) loss 7.2776 (7.0121) grad_norm 2.7875 (2.5546) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:19:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][430/625] eta 0:01:59 lr 0.000127 wd 0.0500 time 0.6131 (0.6139) data time 0.0007 (0.0021) model time 0.6123 (0.6115) loss 6.9878 (7.0130) grad_norm 2.0193 (2.6231) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:19:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][440/625] eta 0:01:53 lr 0.000127 wd 0.0500 time 0.6153 (0.6139) data time 0.0008 (0.0021) model time 0.6146 (0.6116) loss 6.3092 (7.0156) grad_norm 2.0286 (2.6147) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:19:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][450/625] eta 0:01:47 lr 0.000127 wd 0.0500 time 0.6127 (0.6138) data time 0.0010 (0.0020) model time 0.6117 (0.6116) loss 6.6188 (7.0133) grad_norm 2.0501 (2.6064) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:19:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][460/625] eta 0:01:41 lr 0.000126 wd 0.0500 time 0.6087 (0.6138) data time 0.0010 (0.0020) model time 0.6077 (0.6116) loss 7.3099 (7.0184) grad_norm 2.0858 (2.6299) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:19:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][470/625] eta 0:01:35 lr 0.000126 wd 0.0500 time 0.6073 (0.6137) data time 0.0010 (0.0020) model time 0.6063 (0.6115) loss 8.1668 (7.0204) grad_norm 1.9766 (2.6177) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:19:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][480/625] eta 0:01:29 lr 0.000126 wd 0.0500 time 0.6096 (0.6139) data time 0.0008 (0.0020) model time 0.6088 (0.6117) loss 6.5212 (7.0167) grad_norm 2.0207 (2.6103) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:19:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][490/625] eta 0:01:22 lr 0.000126 wd 0.0500 time 0.6079 (0.6138) data time 0.0010 (0.0020) model time 0.6069 (0.6117) loss 6.6853 (7.0131) grad_norm 2.0185 (2.6020) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:19:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][500/625] eta 0:01:16 lr 0.000126 wd 0.0500 time 0.6186 (0.6138) data time 0.0011 (0.0019) model time 0.6175 (0.6116) loss 7.0434 (7.0133) grad_norm 2.1422 (2.6016) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:19:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][510/625] eta 0:01:10 lr 0.000126 wd 0.0500 time 0.6110 (0.6137) data time 0.0011 (0.0019) model time 0.6098 (0.6116) loss 6.2257 (7.0202) grad_norm 1.7625 (2.5966) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:20:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][520/625] eta 0:01:04 lr 0.000126 wd 0.0500 time 0.6117 (0.6137) data time 0.0008 (0.0019) model time 0.6109 (0.6116) loss 7.0403 (7.0200) grad_norm 2.8949 (2.5955) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:20:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][530/625] eta 0:00:58 lr 0.000126 wd 0.0500 time 0.6118 (0.6136) data time 0.0008 (0.0019) model time 0.6110 (0.6115) loss 6.0546 (7.0189) grad_norm 3.8528 (2.6020) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:20:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][540/625] eta 0:00:52 lr 0.000126 wd 0.0500 time 0.6110 (0.6135) data time 0.0010 (0.0019) model time 0.6100 (0.6115) loss 6.4486 (7.0168) grad_norm 1.7934 (2.6012) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:20:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][550/625] eta 0:00:46 lr 0.000126 wd 0.0500 time 0.6120 (0.6135) data time 0.0010 (0.0019) model time 0.6110 (0.6114) loss 6.3837 (7.0219) grad_norm 2.3750 (2.6089) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:20:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][560/625] eta 0:00:39 lr 0.000126 wd 0.0500 time 0.6077 (0.6134) data time 0.0010 (0.0019) model time 0.6067 (0.6113) loss 7.0163 (7.0174) grad_norm 2.0093 (2.6394) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:20:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][570/625] eta 0:00:33 lr 0.000126 wd 0.0500 time 0.6127 (0.6133) data time 0.0010 (0.0018) model time 0.6118 (0.6113) loss 7.2547 (7.0110) grad_norm 2.5767 (2.6381) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:20:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][580/625] eta 0:00:27 lr 0.000126 wd 0.0500 time 0.6135 (0.6133) data time 0.0008 (0.0018) model time 0.6127 (0.6112) loss 6.9685 (7.0095) grad_norm 2.1955 (2.6831) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:20:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][590/625] eta 0:00:21 lr 0.000126 wd 0.0500 time 0.6131 (0.6132) data time 0.0010 (0.0018) model time 0.6121 (0.6112) loss 7.1454 (7.0135) grad_norm 2.6566 (2.6837) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:20:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][600/625] eta 0:00:15 lr 0.000126 wd 0.0500 time 0.6121 (0.6132) data time 0.0010 (0.0018) model time 0.6111 (0.6112) loss 6.4373 (7.0087) grad_norm 2.6710 (2.6991) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:20:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][610/625] eta 0:00:09 lr 0.000126 wd 0.0500 time 0.6081 (0.6132) data time 0.0008 (0.0018) model time 0.6073 (0.6112) loss 8.3815 (7.0099) grad_norm 2.2942 (2.7012) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:21:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [243/300][620/625] eta 0:00:03 lr 0.000125 wd 0.0500 time 0.6099 (0.6131) data time 0.0005 (0.0018) model time 0.6094 (0.6111) loss 5.9910 (7.0081) grad_norm 2.5742 (2.7029) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:21:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 243 training takes 0:06:23 +[2024-07-28 00:21:04 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 00:21:06 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 00:21:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.522 (0.522) Loss 0.4868 (0.4868) Acc@1 90.332 (90.332) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-28 00:21:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.162) Loss 0.7456 (0.5963) Acc@1 83.057 (87.984) Acc@5 96.924 (98.082) Mem 22339MB +[2024-07-28 00:21:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.145) Loss 0.8398 (0.6871) Acc@1 80.518 (85.263) Acc@5 95.996 (97.280) Mem 22339MB +[2024-07-28 00:21:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.877 Acc@5 97.295 +[2024-07-28 00:21:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.9% +[2024-07-28 00:21:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.852 (0.852) Loss 0.5039 (0.5039) Acc@1 90.381 (90.381) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-28 00:21:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.193) Loss 0.7432 (0.6125) Acc@1 83.203 (87.997) Acc@5 96.973 (98.105) Mem 22339MB +[2024-07-28 00:21:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.161) Loss 0.8438 (0.6983) Acc@1 80.176 (85.193) Acc@5 96.094 (97.298) Mem 22339MB +[2024-07-28 00:21:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.823 Acc@5 97.299 +[2024-07-28 00:21:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.8% +[2024-07-28 00:21:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.82% +[2024-07-28 00:21:13 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-28 00:21:15 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-28 00:21:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][0/625] eta 0:10:07 lr 0.000125 wd 0.0500 time 0.9725 (0.9725) data time 0.4372 (0.4372) model time 0.0000 (0.0000) loss 6.8129 (6.8129) grad_norm 3.6747 (3.6747) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:21:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][10/625] eta 0:06:35 lr 0.000125 wd 0.0500 time 0.6038 (0.6423) data time 0.0011 (0.0407) model time 0.0000 (0.0000) loss 6.6660 (7.1262) grad_norm 1.8519 (2.4038) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:21:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][20/625] eta 0:06:20 lr 0.000125 wd 0.0500 time 0.6128 (0.6287) data time 0.0010 (0.0218) model time 0.0000 (0.0000) loss 7.3283 (7.1497) grad_norm 2.3840 (2.3820) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:21:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][30/625] eta 0:06:11 lr 0.000125 wd 0.0500 time 0.6103 (0.6240) data time 0.0007 (0.0151) model time 0.0000 (0.0000) loss 6.1898 (7.2651) grad_norm 2.3160 (2.3755) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:21:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][40/625] eta 0:06:03 lr 0.000125 wd 0.0500 time 0.6091 (0.6216) data time 0.0008 (0.0116) model time 0.0000 (0.0000) loss 6.1554 (7.2540) grad_norm 2.5166 (2.4602) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:21:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][50/625] eta 0:05:56 lr 0.000125 wd 0.0500 time 0.6023 (0.6192) data time 0.0011 (0.0096) model time 0.0000 (0.0000) loss 6.8465 (7.1759) grad_norm 2.2989 (2.4494) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:21:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][60/625] eta 0:05:49 lr 0.000125 wd 0.0500 time 0.6063 (0.6178) data time 0.0010 (0.0082) model time 0.6053 (0.6090) loss 8.0829 (7.1522) grad_norm 1.8834 (2.4144) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:21:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][70/625] eta 0:05:43 lr 0.000125 wd 0.0500 time 0.6011 (0.6193) data time 0.0007 (0.0072) model time 0.6004 (0.6182) loss 6.1629 (7.1233) grad_norm 3.0096 (2.5210) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:22:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][80/625] eta 0:05:36 lr 0.000125 wd 0.0500 time 0.6140 (0.6182) data time 0.0008 (0.0064) model time 0.6132 (0.6154) loss 6.3008 (7.0848) grad_norm 1.9770 (2.5129) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:22:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][90/625] eta 0:05:30 lr 0.000125 wd 0.0500 time 0.6110 (0.6176) data time 0.0008 (0.0058) model time 0.6102 (0.6144) loss 7.8245 (7.1282) grad_norm 2.2806 (2.4964) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:22:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][100/625] eta 0:05:23 lr 0.000125 wd 0.0500 time 0.6059 (0.6171) data time 0.0010 (0.0054) model time 0.6048 (0.6138) loss 6.3461 (7.1399) grad_norm 1.7661 (2.4770) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:22:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][110/625] eta 0:05:17 lr 0.000125 wd 0.0500 time 0.6104 (0.6168) data time 0.0010 (0.0050) model time 0.6094 (0.6136) loss 6.2363 (7.1235) grad_norm 6.6584 (2.4892) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:22:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][120/625] eta 0:05:11 lr 0.000125 wd 0.0500 time 0.6042 (0.6163) data time 0.0008 (0.0049) model time 0.6035 (0.6128) loss 8.4652 (7.0901) grad_norm 2.7750 (2.4868) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:22:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][130/625] eta 0:05:04 lr 0.000125 wd 0.0500 time 0.6099 (0.6158) data time 0.0010 (0.0046) model time 0.6089 (0.6122) loss 9.3631 (7.0815) grad_norm 2.4281 (2.4916) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:22:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][140/625] eta 0:04:58 lr 0.000125 wd 0.0500 time 0.6041 (0.6154) data time 0.0009 (0.0043) model time 0.6033 (0.6118) loss 5.9228 (7.0769) grad_norm 2.3637 (2.4771) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:22:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][150/625] eta 0:04:52 lr 0.000125 wd 0.0500 time 0.6022 (0.6148) data time 0.0008 (0.0041) model time 0.6014 (0.6113) loss 7.8409 (7.0786) grad_norm 2.9630 (2.4723) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:22:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][160/625] eta 0:04:45 lr 0.000124 wd 0.0500 time 0.6073 (0.6146) data time 0.0010 (0.0039) model time 0.6063 (0.6112) loss 6.8445 (7.0563) grad_norm 2.1911 (2.4519) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:23:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][170/625] eta 0:04:39 lr 0.000124 wd 0.0500 time 0.6041 (0.6144) data time 0.0008 (0.0038) model time 0.6033 (0.6111) loss 7.1213 (7.0686) grad_norm 2.6860 (2.4739) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:23:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][180/625] eta 0:04:33 lr 0.000124 wd 0.0500 time 0.6059 (0.6150) data time 0.0010 (0.0036) model time 0.6049 (0.6121) loss 7.1806 (7.0816) grad_norm 2.1747 (2.4723) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:23:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][190/625] eta 0:04:27 lr 0.000124 wd 0.0500 time 0.6045 (0.6149) data time 0.0009 (0.0035) model time 0.6035 (0.6121) loss 7.3353 (7.0985) grad_norm 2.3413 (2.4634) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:23:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][200/625] eta 0:04:21 lr 0.000124 wd 0.0500 time 0.6032 (0.6147) data time 0.0010 (0.0034) model time 0.6022 (0.6118) loss 6.6051 (7.0979) grad_norm 2.9251 (2.4707) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:23:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][210/625] eta 0:04:14 lr 0.000124 wd 0.0500 time 0.6088 (0.6144) data time 0.0008 (0.0032) model time 0.6079 (0.6117) loss 6.6177 (7.1031) grad_norm 2.3967 (2.4774) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:23:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][220/625] eta 0:04:08 lr 0.000124 wd 0.0500 time 0.5989 (0.6142) data time 0.0010 (0.0031) model time 0.5979 (0.6114) loss 6.6675 (7.0976) grad_norm 2.9319 (2.4638) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:23:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][230/625] eta 0:04:02 lr 0.000124 wd 0.0500 time 0.6089 (0.6141) data time 0.0008 (0.0031) model time 0.6081 (0.6114) loss 5.7927 (7.0779) grad_norm 2.3154 (2.4530) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:23:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][240/625] eta 0:03:56 lr 0.000124 wd 0.0500 time 0.6042 (0.6141) data time 0.0008 (0.0030) model time 0.6034 (0.6115) loss 6.8399 (7.0885) grad_norm 2.2461 (2.4804) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:23:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][250/625] eta 0:03:50 lr 0.000124 wd 0.0500 time 0.6072 (0.6140) data time 0.0011 (0.0029) model time 0.6061 (0.6115) loss 5.3935 (7.0826) grad_norm 2.1442 (2.4946) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:23:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][260/625] eta 0:03:44 lr 0.000124 wd 0.0500 time 0.6081 (0.6140) data time 0.0008 (0.0028) model time 0.6073 (0.6115) loss 6.7846 (7.0816) grad_norm 2.0812 (2.4901) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:24:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][270/625] eta 0:03:37 lr 0.000124 wd 0.0500 time 0.6040 (0.6139) data time 0.0010 (0.0028) model time 0.6031 (0.6115) loss 6.8452 (7.0918) grad_norm 2.3399 (2.4894) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:24:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][280/625] eta 0:03:31 lr 0.000124 wd 0.0500 time 0.6034 (0.6137) data time 0.0010 (0.0027) model time 0.6024 (0.6113) loss 8.0973 (7.1083) grad_norm 2.5686 (2.5106) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:24:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][290/625] eta 0:03:25 lr 0.000124 wd 0.0500 time 0.5993 (0.6143) data time 0.0011 (0.0027) model time 0.5982 (0.6121) loss 6.4989 (7.1117) grad_norm 2.2688 (2.5384) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:24:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][300/625] eta 0:03:19 lr 0.000124 wd 0.0500 time 0.6073 (0.6142) data time 0.0008 (0.0026) model time 0.6065 (0.6120) loss 6.5317 (7.1129) grad_norm 2.3133 (2.5417) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:24:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][310/625] eta 0:03:13 lr 0.000124 wd 0.0500 time 0.6111 (0.6142) data time 0.0008 (0.0026) model time 0.6103 (0.6120) loss 7.1294 (7.1193) grad_norm 3.2657 (2.5599) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:24:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][320/625] eta 0:03:07 lr 0.000123 wd 0.0500 time 0.6060 (0.6141) data time 0.0010 (0.0025) model time 0.6050 (0.6120) loss 6.4838 (7.1178) grad_norm 3.1229 (2.5585) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:24:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][330/625] eta 0:03:01 lr 0.000123 wd 0.0500 time 0.6054 (0.6142) data time 0.0011 (0.0025) model time 0.6043 (0.6121) loss 8.1775 (7.1219) grad_norm 2.2786 (2.5475) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:24:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][340/625] eta 0:02:55 lr 0.000123 wd 0.0500 time 0.6089 (0.6141) data time 0.0009 (0.0024) model time 0.6080 (0.6120) loss 7.6558 (7.1166) grad_norm 1.9734 (2.5404) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:24:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][350/625] eta 0:02:48 lr 0.000123 wd 0.0500 time 0.6046 (0.6140) data time 0.0008 (0.0024) model time 0.6038 (0.6120) loss 7.6767 (7.1146) grad_norm 2.1311 (2.5451) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:24:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][360/625] eta 0:02:42 lr 0.000123 wd 0.0500 time 0.6049 (0.6139) data time 0.0010 (0.0023) model time 0.6039 (0.6119) loss 8.2526 (7.1187) grad_norm 3.8839 (2.5543) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:25:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][370/625] eta 0:02:36 lr 0.000123 wd 0.0500 time 0.6011 (0.6137) data time 0.0008 (0.0023) model time 0.6003 (0.6117) loss 7.2023 (7.1179) grad_norm 3.1771 (2.5505) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:25:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][380/625] eta 0:02:30 lr 0.000123 wd 0.0500 time 0.6067 (0.6137) data time 0.0010 (0.0023) model time 0.6057 (0.6117) loss 7.5609 (7.1109) grad_norm 1.8224 (2.5462) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:25:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][390/625] eta 0:02:24 lr 0.000123 wd 0.0500 time 0.5362 (0.6141) data time 0.0009 (0.0023) model time 0.5353 (0.6122) loss 6.3648 (7.0974) grad_norm 2.0520 (2.5305) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:25:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][400/625] eta 0:02:18 lr 0.000123 wd 0.0500 time 0.6112 (0.6141) data time 0.0008 (0.0022) model time 0.6105 (0.6122) loss 6.8273 (7.0842) grad_norm 1.8651 (2.5275) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:25:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][410/625] eta 0:02:12 lr 0.000123 wd 0.0500 time 0.6046 (0.6141) data time 0.0009 (0.0022) model time 0.6038 (0.6122) loss 6.6515 (7.0799) grad_norm 2.0800 (2.5203) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:25:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][420/625] eta 0:02:05 lr 0.000123 wd 0.0500 time 0.6045 (0.6140) data time 0.0009 (0.0022) model time 0.6037 (0.6122) loss 7.4580 (7.0891) grad_norm 1.8157 (2.5093) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:25:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][430/625] eta 0:01:59 lr 0.000123 wd 0.0500 time 0.6037 (0.6140) data time 0.0008 (0.0021) model time 0.6029 (0.6121) loss 7.6148 (7.0830) grad_norm 2.6835 (2.5122) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:25:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][440/625] eta 0:01:53 lr 0.000123 wd 0.0500 time 0.6019 (0.6139) data time 0.0010 (0.0021) model time 0.6009 (0.6121) loss 7.7050 (7.0788) grad_norm 2.1144 (2.5072) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:25:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][450/625] eta 0:01:47 lr 0.000123 wd 0.0500 time 0.6064 (0.6139) data time 0.0011 (0.0021) model time 0.6053 (0.6120) loss 6.5177 (7.0735) grad_norm 3.1993 (2.5024) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:25:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][460/625] eta 0:01:41 lr 0.000123 wd 0.0500 time 0.6084 (0.6139) data time 0.0008 (0.0021) model time 0.6076 (0.6121) loss 5.9850 (7.0671) grad_norm 2.6202 (2.4960) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:26:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][470/625] eta 0:01:35 lr 0.000123 wd 0.0500 time 0.6075 (0.6139) data time 0.0010 (0.0021) model time 0.6065 (0.6121) loss 6.2174 (7.0683) grad_norm 2.1152 (2.4903) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:26:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][480/625] eta 0:01:29 lr 0.000122 wd 0.0500 time 0.6092 (0.6140) data time 0.0008 (0.0020) model time 0.6085 (0.6122) loss 6.8936 (7.0619) grad_norm 2.6181 (2.4978) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:26:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][490/625] eta 0:01:22 lr 0.000122 wd 0.0500 time 0.6038 (0.6139) data time 0.0007 (0.0020) model time 0.6031 (0.6122) loss 6.8382 (7.0524) grad_norm 3.3120 (2.5239) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:26:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][500/625] eta 0:01:16 lr 0.000122 wd 0.0500 time 0.6054 (0.6139) data time 0.0010 (0.0020) model time 0.6044 (0.6122) loss 7.5719 (7.0527) grad_norm 2.0836 (2.5213) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:26:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][510/625] eta 0:01:10 lr 0.000122 wd 0.0500 time 0.6024 (0.6142) data time 0.0010 (0.0020) model time 0.6014 (0.6125) loss 7.1859 (7.0504) grad_norm 2.2243 (2.5184) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:26:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][520/625] eta 0:01:04 lr 0.000122 wd 0.0500 time 0.6090 (0.6142) data time 0.0008 (0.0020) model time 0.6083 (0.6125) loss 7.2916 (7.0543) grad_norm 2.8892 (2.5117) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 00:26:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][530/625] eta 0:00:58 lr 0.000122 wd 0.0500 time 0.6088 (0.6143) data time 0.0010 (0.0020) model time 0.6078 (0.6127) loss 6.4431 (7.0514) grad_norm 2.1342 (2.5042) loss_scale 256.0000 (129.9284) mem 22339MB +[2024-07-28 00:26:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][540/625] eta 0:00:52 lr 0.000122 wd 0.0500 time 0.6097 (0.6144) data time 0.0010 (0.0019) model time 0.6087 (0.6128) loss 7.6477 (7.0549) grad_norm 2.2167 (2.5000) loss_scale 256.0000 (132.2588) mem 22339MB +[2024-07-28 00:26:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][550/625] eta 0:00:46 lr 0.000122 wd 0.0500 time 0.6094 (0.6144) data time 0.0011 (0.0019) model time 0.6083 (0.6128) loss 6.6810 (7.0558) grad_norm 2.4565 (2.5012) loss_scale 256.0000 (134.5045) mem 22339MB +[2024-07-28 00:27:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][560/625] eta 0:00:39 lr 0.000122 wd 0.0500 time 0.6041 (0.6144) data time 0.0008 (0.0019) model time 0.6033 (0.6128) loss 6.4041 (7.0444) grad_norm 2.1468 (2.4958) loss_scale 256.0000 (136.6702) mem 22339MB +[2024-07-28 00:27:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][570/625] eta 0:00:33 lr 0.000122 wd 0.0500 time 0.6023 (0.6142) data time 0.0011 (0.0019) model time 0.6012 (0.6126) loss 7.1776 (7.0396) grad_norm 3.1653 (2.5018) loss_scale 256.0000 (138.7601) mem 22339MB +[2024-07-28 00:27:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][580/625] eta 0:00:27 lr 0.000122 wd 0.0500 time 0.6019 (0.6142) data time 0.0008 (0.0019) model time 0.6011 (0.6126) loss 6.1637 (7.0403) grad_norm 2.0687 (2.5008) loss_scale 256.0000 (140.7780) mem 22339MB +[2024-07-28 00:27:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][590/625] eta 0:00:21 lr 0.000122 wd 0.0500 time 0.5997 (0.6141) data time 0.0011 (0.0019) model time 0.5986 (0.6125) loss 6.7447 (7.0407) grad_norm 2.3020 (2.4961) loss_scale 256.0000 (142.7276) mem 22339MB +[2024-07-28 00:27:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][600/625] eta 0:00:15 lr 0.000122 wd 0.0500 time 0.6067 (0.6141) data time 0.0008 (0.0018) model time 0.6059 (0.6125) loss 6.8287 (7.0375) grad_norm 2.7383 (2.4902) loss_scale 256.0000 (144.6123) mem 22339MB +[2024-07-28 00:27:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][610/625] eta 0:00:09 lr 0.000122 wd 0.0500 time 0.6083 (0.6141) data time 0.0007 (0.0018) model time 0.6076 (0.6125) loss 7.8795 (7.0442) grad_norm 2.7592 (2.4943) loss_scale 256.0000 (146.4354) mem 22339MB +[2024-07-28 00:27:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [244/300][620/625] eta 0:00:03 lr 0.000122 wd 0.0500 time 0.6087 (0.6143) data time 0.0007 (0.0018) model time 0.6079 (0.6127) loss 8.0305 (7.0402) grad_norm 1.6356 (2.5126) loss_scale 256.0000 (148.1997) mem 22339MB +[2024-07-28 00:27:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 244 training takes 0:06:23 +[2024-07-28 00:27:39 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 00:27:41 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 00:27:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.523 (0.523) Loss 0.4944 (0.4944) Acc@1 90.625 (90.625) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-28 00:27:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.163) Loss 0.7500 (0.6037) Acc@1 83.057 (88.019) Acc@5 96.924 (98.131) Mem 22339MB +[2024-07-28 00:27:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.145) Loss 0.8433 (0.6930) Acc@1 79.980 (85.259) Acc@5 96.143 (97.289) Mem 22339MB +[2024-07-28 00:27:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.909 Acc@5 97.311 +[2024-07-28 00:27:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.9% +[2024-07-28 00:27:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 84.91% +[2024-07-28 00:27:45 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-28 00:27:49 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-28 00:27:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.507 (0.507) Loss 0.5029 (0.5029) Acc@1 90.381 (90.381) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-28 00:27:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.161) Loss 0.7432 (0.6119) Acc@1 83.154 (88.010) Acc@5 96.924 (98.105) Mem 22339MB +[2024-07-28 00:27:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.144) Loss 0.8428 (0.6976) Acc@1 80.225 (85.200) Acc@5 96.045 (97.303) Mem 22339MB +[2024-07-28 00:27:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.825 Acc@5 97.305 +[2024-07-28 00:27:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.8% +[2024-07-28 00:27:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.82% +[2024-07-28 00:27:53 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-28 00:27:54 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-28 00:27:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][0/625] eta 0:09:49 lr 0.000122 wd 0.0500 time 0.9439 (0.9439) data time 0.4111 (0.4111) model time 0.0000 (0.0000) loss 6.9678 (6.9678) grad_norm 1.9164 (1.9164) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 00:28:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][10/625] eta 0:06:33 lr 0.000121 wd 0.0500 time 0.6041 (0.6401) data time 0.0010 (0.0383) model time 0.0000 (0.0000) loss 5.9909 (6.9697) grad_norm 2.1923 (1.9881) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 00:28:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][20/625] eta 0:06:18 lr 0.000121 wd 0.0500 time 0.6034 (0.6254) data time 0.0008 (0.0205) model time 0.0000 (0.0000) loss 6.1382 (6.9552) grad_norm 2.8458 (2.9529) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 00:28:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][30/625] eta 0:06:08 lr 0.000121 wd 0.0500 time 0.6071 (0.6200) data time 0.0010 (0.0142) model time 0.0000 (0.0000) loss 6.7306 (7.0066) grad_norm 2.0074 (2.8614) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 00:28:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][40/625] eta 0:06:01 lr 0.000121 wd 0.0500 time 0.6062 (0.6176) data time 0.0011 (0.0110) model time 0.0000 (0.0000) loss 7.8762 (6.9613) grad_norm 1.7685 (2.8625) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 00:28:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][50/625] eta 0:05:54 lr 0.000121 wd 0.0500 time 0.6108 (0.6170) data time 0.0008 (0.0090) model time 0.0000 (0.0000) loss 6.3490 (7.0001) grad_norm 2.1598 (2.8272) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 00:28:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][60/625] eta 0:05:48 lr 0.000121 wd 0.0500 time 0.6079 (0.6163) data time 0.0010 (0.0077) model time 0.6069 (0.6115) loss 7.4522 (7.0453) grad_norm 2.7116 (2.8059) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 00:28:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][70/625] eta 0:05:41 lr 0.000121 wd 0.0500 time 0.6072 (0.6157) data time 0.0008 (0.0068) model time 0.6064 (0.6111) loss 6.0558 (6.9728) grad_norm 1.8950 (2.7032) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 00:28:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][80/625] eta 0:05:35 lr 0.000121 wd 0.0500 time 0.6082 (0.6150) data time 0.0008 (0.0061) model time 0.6074 (0.6104) loss 8.1330 (7.0435) grad_norm 3.6787 (2.7583) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 00:28:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][90/625] eta 0:05:28 lr 0.000121 wd 0.0500 time 0.6039 (0.6146) data time 0.0010 (0.0055) model time 0.6029 (0.6104) loss 7.7277 (7.0065) grad_norm 2.8816 (2.7118) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 00:28:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][100/625] eta 0:05:22 lr 0.000121 wd 0.0500 time 0.6024 (0.6141) data time 0.0008 (0.0051) model time 0.6016 (0.6100) loss 7.2116 (6.9959) grad_norm 2.6981 (2.6896) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 00:29:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][110/625] eta 0:05:16 lr 0.000121 wd 0.0500 time 0.6040 (0.6155) data time 0.0007 (0.0047) model time 0.6033 (0.6132) loss 5.9871 (7.0002) grad_norm 2.1722 (2.7101) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 00:29:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][120/625] eta 0:05:10 lr 0.000121 wd 0.0500 time 0.6127 (0.6156) data time 0.0010 (0.0044) model time 0.6117 (0.6136) loss 7.1313 (7.0153) grad_norm 2.6901 (2.7171) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 00:29:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][130/625] eta 0:05:04 lr 0.000121 wd 0.0500 time 0.6135 (0.6155) data time 0.0007 (0.0041) model time 0.6127 (0.6134) loss 6.7750 (7.0133) grad_norm 1.9341 (2.6888) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 00:29:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][140/625] eta 0:04:58 lr 0.000121 wd 0.0500 time 0.6062 (0.6152) data time 0.0009 (0.0039) model time 0.6053 (0.6132) loss 6.0442 (6.9863) grad_norm 2.0172 (2.6766) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 00:29:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][150/625] eta 0:04:52 lr 0.000121 wd 0.0500 time 0.6022 (0.6149) data time 0.0007 (0.0037) model time 0.6014 (0.6129) loss 6.7805 (6.9771) grad_norm 11.3729 (2.7290) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 00:29:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][160/625] eta 0:04:45 lr 0.000121 wd 0.0500 time 0.6012 (0.6147) data time 0.0008 (0.0036) model time 0.6004 (0.6126) loss 7.2737 (6.9772) grad_norm 3.6698 (2.7590) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 00:29:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-28 00:29:37 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 00:29:38 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 00:45:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 00:45:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 00:45:48 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-28 00:45:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-28 00:45:57 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-28 00:45:58 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-28 00:45:58 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-28 00:45:58 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 245) +[2024-07-28 00:45:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-28 00:46:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][170/625] eta 0:21:24 lr 0.000121 wd 0.0500 time 0.5749 (2.8236) data time 0.0006 (0.1666) model time 0.5743 (2.6571) loss 6.7523 (7.4419) grad_norm 2.1770 (2.4545) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 00:46:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][180/625] eta 0:09:01 lr 0.000120 wd 0.0500 time 0.5786 (1.2176) data time 0.0006 (0.0482) model time 0.5781 (1.1694) loss 8.1325 (7.4461) grad_norm 3.3260 (2.4849) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 00:46:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][190/625] eta 0:06:53 lr 0.000120 wd 0.0500 time 0.5741 (0.9495) data time 0.0008 (0.0285) model time 0.5733 (0.9210) loss 7.0175 (7.3441) grad_norm 2.1462 (2.5097) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 00:46:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][200/625] eta 0:05:56 lr 0.000120 wd 0.0500 time 0.5761 (0.8386) data time 0.0007 (0.0204) model time 0.5754 (0.8182) loss 6.5283 (7.3594) grad_norm 1.7944 (2.4355) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 00:46:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][210/625] eta 0:05:22 lr 0.000120 wd 0.0500 time 0.5718 (0.7780) data time 0.0007 (0.0160) model time 0.5711 (0.7620) loss 6.4067 (7.2303) grad_norm 4.0167 (2.4791) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 00:46:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][220/625] eta 0:05:00 lr 0.000120 wd 0.0500 time 0.5790 (0.7425) data time 0.0006 (0.0132) model time 0.5784 (0.7293) loss 8.4123 (7.1888) grad_norm 2.9194 (2.6587) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 00:46:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][230/625] eta 0:04:44 lr 0.000120 wd 0.0500 time 0.5813 (0.7199) data time 0.0007 (0.0112) model time 0.5806 (0.7087) loss 7.1419 (7.1313) grad_norm 1.9547 (2.5727) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 00:46:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][240/625] eta 0:04:29 lr 0.000120 wd 0.0500 time 0.5840 (0.7009) data time 0.0007 (0.0098) model time 0.5834 (0.6910) loss 6.3027 (7.1068) grad_norm 2.6159 (2.6364) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 00:47:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][250/625] eta 0:04:17 lr 0.000120 wd 0.0500 time 0.5820 (0.6864) data time 0.0009 (0.0088) model time 0.5811 (0.6776) loss 7.1996 (7.1012) grad_norm 2.6376 (2.6304) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 00:47:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][260/625] eta 0:04:06 lr 0.000120 wd 0.0500 time 0.5890 (0.6751) data time 0.0008 (0.0080) model time 0.5883 (0.6672) loss 6.4153 (7.0879) grad_norm 2.0853 (2.6187) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 00:47:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][270/625] eta 0:03:56 lr 0.000120 wd 0.0500 time 0.5794 (0.6656) data time 0.0009 (0.0073) model time 0.5785 (0.6583) loss 6.2951 (7.1354) grad_norm 2.3975 (2.6249) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 00:47:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][280/625] eta 0:03:46 lr 0.000120 wd 0.0500 time 0.5819 (0.6577) data time 0.0009 (0.0067) model time 0.5810 (0.6510) loss 7.7719 (7.1382) grad_norm 2.1707 (2.5903) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 00:47:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][290/625] eta 0:03:38 lr 0.000120 wd 0.0500 time 0.5794 (0.6510) data time 0.0008 (0.0063) model time 0.5786 (0.6448) loss 7.0512 (7.1423) grad_norm 2.1053 (2.5664) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 00:47:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][300/625] eta 0:03:29 lr 0.000120 wd 0.0500 time 0.5799 (0.6454) data time 0.0009 (0.0059) model time 0.5790 (0.6396) loss 8.0506 (7.1349) grad_norm 2.0801 (2.6428) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 00:47:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][310/625] eta 0:03:21 lr 0.000120 wd 0.0500 time 0.6026 (0.6410) data time 0.0006 (0.0055) model time 0.6020 (0.6354) loss 6.7048 (7.1066) grad_norm 1.6416 (2.6257) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 00:47:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][320/625] eta 0:03:14 lr 0.000120 wd 0.0500 time 0.5890 (0.6371) data time 0.0006 (0.0052) model time 0.5883 (0.6318) loss 7.0413 (7.1031) grad_norm 3.5174 (2.6338) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 00:47:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][330/625] eta 0:03:06 lr 0.000120 wd 0.0500 time 0.5918 (0.6336) data time 0.0006 (0.0050) model time 0.5912 (0.6287) loss 6.5890 (7.1006) grad_norm 2.0464 (2.6437) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 00:47:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][340/625] eta 0:02:59 lr 0.000119 wd 0.0500 time 0.5785 (0.6304) data time 0.0008 (0.0048) model time 0.5777 (0.6256) loss 5.9459 (7.0941) grad_norm 2.8011 (2.6373) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 00:47:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][350/625] eta 0:02:52 lr 0.000119 wd 0.0500 time 0.5787 (0.6275) data time 0.0006 (0.0045) model time 0.5781 (0.6230) loss 6.8514 (7.0943) grad_norm 2.2005 (2.6366) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 00:48:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][360/625] eta 0:02:45 lr 0.000119 wd 0.0500 time 0.5794 (0.6248) data time 0.0006 (0.0044) model time 0.5788 (0.6204) loss 7.0644 (7.0949) grad_norm 1.6300 (2.6332) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 00:48:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][370/625] eta 0:02:38 lr 0.000119 wd 0.0500 time 0.5794 (0.6224) data time 0.0008 (0.0042) model time 0.5786 (0.6182) loss 6.8881 (7.0774) grad_norm 4.2790 (2.6312) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 00:48:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][380/625] eta 0:02:31 lr 0.000119 wd 0.0500 time 0.5851 (0.6203) data time 0.0006 (0.0041) model time 0.5845 (0.6162) loss 7.1634 (7.0638) grad_norm 2.0755 (2.6200) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 00:48:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][390/625] eta 0:02:25 lr 0.000119 wd 0.0500 time 0.5841 (0.6185) data time 0.0009 (0.0039) model time 0.5832 (0.6146) loss 8.1080 (7.0603) grad_norm 2.6206 (2.6035) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 00:48:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][400/625] eta 0:02:18 lr 0.000119 wd 0.0500 time 0.5839 (0.6168) data time 0.0007 (0.0038) model time 0.5832 (0.6130) loss 5.8499 (7.0393) grad_norm 1.6205 (2.5844) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 00:48:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][410/625] eta 0:02:12 lr 0.000119 wd 0.0500 time 0.5799 (0.6152) data time 0.0006 (0.0037) model time 0.5793 (0.6116) loss 6.0843 (7.0423) grad_norm 2.1050 (2.5658) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 00:48:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][420/625] eta 0:02:05 lr 0.000119 wd 0.0500 time 0.5853 (0.6137) data time 0.0006 (0.0036) model time 0.5847 (0.6102) loss 5.5227 (7.0240) grad_norm 2.5181 (2.5634) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 00:48:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][430/625] eta 0:01:59 lr 0.000119 wd 0.0500 time 0.5852 (0.6124) data time 0.0006 (0.0035) model time 0.5846 (0.6089) loss 6.7972 (7.0132) grad_norm 1.7876 (2.5648) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 00:48:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-28 00:48:45 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 00:48:49 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 00:55:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 00:55:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 01:02:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 01:02:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 01:04:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 01:04:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 01:13:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 01:13:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 01:13:51 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-28 01:14:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-28 01:14:08 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-28 01:14:09 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-28 01:14:09 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-28 01:14:09 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 245) +[2024-07-28 01:14:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-28 01:14:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][440/625] eta 0:05:20 lr 0.000119 wd 0.0500 time 0.6127 (1.7336) data time 0.0011 (0.0687) model time 0.6116 (1.6649) loss 7.7446 (7.3160) grad_norm 5.5338 (2.9084) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-28 01:14:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][450/625] eta 0:03:23 lr 0.000119 wd 0.0500 time 0.6364 (1.1639) data time 0.0009 (0.0349) model time 0.6355 (1.1290) loss 7.9487 (7.2182) grad_norm 1.9628 (2.9779) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-28 01:14:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][460/625] eta 0:02:40 lr 0.000119 wd 0.0500 time 0.5939 (0.9712) data time 0.0010 (0.0240) model time 0.5929 (0.9472) loss 9.2866 (7.3623) grad_norm 2.1364 (2.8659) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-28 01:14:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][470/625] eta 0:02:15 lr 0.000119 wd 0.0500 time 0.6116 (0.8753) data time 0.0008 (0.0182) model time 0.6108 (0.8571) loss 5.7856 (7.2625) grad_norm 3.5259 (2.8106) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-28 01:14:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][480/625] eta 0:01:58 lr 0.000119 wd 0.0500 time 0.5896 (0.8186) data time 0.0012 (0.0148) model time 0.5885 (0.8038) loss 7.0018 (7.2326) grad_norm 1.9203 (2.7591) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-28 01:15:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][490/625] eta 0:01:46 lr 0.000119 wd 0.0500 time 0.5940 (0.7869) data time 0.0008 (0.0126) model time 0.5932 (0.7742) loss 6.3886 (7.1471) grad_norm 3.2775 (2.7770) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-28 01:15:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][500/625] eta 0:01:34 lr 0.000118 wd 0.0500 time 0.5952 (0.7589) data time 0.0008 (0.0110) model time 0.5944 (0.7480) loss 6.7660 (7.1321) grad_norm 2.1765 (2.7158) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-28 01:15:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][510/625] eta 0:01:25 lr 0.000118 wd 0.0500 time 0.6206 (0.7393) data time 0.0010 (0.0097) model time 0.6196 (0.7296) loss 7.6408 (7.1363) grad_norm 1.9520 (2.7879) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-28 01:15:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][520/625] eta 0:01:16 lr 0.000118 wd 0.0500 time 0.6410 (0.7241) data time 0.0008 (0.0088) model time 0.6402 (0.7154) loss 6.7507 (7.1119) grad_norm 1.7938 (2.8356) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-28 01:15:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][530/625] eta 0:01:07 lr 0.000118 wd 0.0500 time 0.5935 (0.7110) data time 0.0010 (0.0081) model time 0.5924 (0.7030) loss 8.7437 (7.1502) grad_norm 2.3292 (2.8241) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-28 01:15:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][540/625] eta 0:00:59 lr 0.000118 wd 0.0500 time 0.5882 (0.7002) data time 0.0011 (0.0075) model time 0.5870 (0.6927) loss 7.0498 (7.1396) grad_norm 2.5945 (2.7781) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-28 01:15:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][550/625] eta 0:00:51 lr 0.000118 wd 0.0500 time 0.5954 (0.6910) data time 0.0008 (0.0069) model time 0.5946 (0.6840) loss 6.7852 (7.1532) grad_norm 4.4562 (2.7505) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-28 01:15:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][560/625] eta 0:00:44 lr 0.000118 wd 0.0500 time 0.5880 (0.6829) data time 0.0009 (0.0065) model time 0.5872 (0.6764) loss 7.6495 (7.1325) grad_norm 2.7984 (2.7286) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-28 01:15:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][570/625] eta 0:00:37 lr 0.000118 wd 0.0500 time 0.6149 (0.6767) data time 0.0008 (0.0061) model time 0.6141 (0.6706) loss 5.9892 (7.1024) grad_norm 3.6870 (2.6973) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-28 01:15:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][580/625] eta 0:00:30 lr 0.000118 wd 0.0500 time 0.6172 (0.6715) data time 0.0012 (0.0058) model time 0.6161 (0.6658) loss 7.9859 (7.1064) grad_norm 2.8669 (2.6770) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-28 01:16:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][590/625] eta 0:00:23 lr 0.000118 wd 0.0500 time 0.6634 (0.6672) data time 0.0010 (0.0055) model time 0.6624 (0.6617) loss 7.7425 (7.1044) grad_norm 2.3299 (2.7010) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-28 01:16:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][600/625] eta 0:00:16 lr 0.000118 wd 0.0500 time 0.5870 (0.6628) data time 0.0009 (0.0052) model time 0.5862 (0.6576) loss 6.0705 (7.1162) grad_norm 2.4737 (2.7064) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-28 01:16:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][610/625] eta 0:00:09 lr 0.000118 wd 0.0500 time 0.5938 (0.6588) data time 0.0005 (0.0050) model time 0.5933 (0.6538) loss 6.4309 (7.0952) grad_norm 1.8499 (2.6822) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-28 01:16:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [245/300][620/625] eta 0:00:03 lr 0.000118 wd 0.0500 time 0.5908 (0.6550) data time 0.0005 (0.0048) model time 0.5903 (0.6503) loss 6.4554 (7.0960) grad_norm 1.8880 (2.6878) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-28 01:16:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 245 training takes 0:02:06 +[2024-07-28 01:16:20 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 01:16:27 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 01:16:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.526 (0.526) Loss 0.4907 (0.4907) Acc@1 90.332 (90.332) Acc@5 98.926 (98.926) Mem 22344MB +[2024-07-28 01:16:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.163) Loss 0.7466 (0.6026) Acc@1 83.008 (87.886) Acc@5 96.924 (98.131) Mem 22344MB +[2024-07-28 01:16:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.145) Loss 0.8232 (0.6914) Acc@1 80.566 (85.249) Acc@5 96.094 (97.296) Mem 22344MB +[2024-07-28 01:16:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.895 Acc@5 97.309 +[2024-07-28 01:16:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.9% +[2024-07-28 01:16:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.952 (0.952) Loss 0.5029 (0.5029) Acc@1 90.381 (90.381) Acc@5 99.023 (99.023) Mem 22344MB +[2024-07-28 01:16:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.201) Loss 0.7437 (0.6116) Acc@1 83.105 (88.010) Acc@5 96.924 (98.113) Mem 22344MB +[2024-07-28 01:16:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.165) Loss 0.8428 (0.6971) Acc@1 80.273 (85.205) Acc@5 95.996 (97.310) Mem 22344MB +[2024-07-28 01:16:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.825 Acc@5 97.311 +[2024-07-28 01:16:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.8% +[2024-07-28 01:16:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.82% +[2024-07-28 01:16:37 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-28 01:16:42 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-28 01:16:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][0/625] eta 0:12:20 lr 0.000118 wd 0.0500 time 1.1854 (1.1854) data time 0.4678 (0.4678) model time 0.0000 (0.0000) loss 6.0111 (6.0111) grad_norm 2.4682 (2.4682) loss_scale 256.0000 (256.0000) mem 22337MB +[2024-07-28 01:16:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][10/625] eta 0:06:36 lr 0.000118 wd 0.0500 time 0.5915 (0.6448) data time 0.0011 (0.0437) model time 0.0000 (0.0000) loss 6.0863 (6.6182) grad_norm 2.2464 (3.3726) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 01:16:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][20/625] eta 0:06:19 lr 0.000118 wd 0.0500 time 0.5896 (0.6273) data time 0.0012 (0.0234) model time 0.0000 (0.0000) loss 6.5567 (6.6472) grad_norm 2.7571 (2.9695) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 01:17:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][30/625] eta 0:06:05 lr 0.000118 wd 0.0500 time 0.5856 (0.6151) data time 0.0010 (0.0162) model time 0.0000 (0.0000) loss 7.2791 (6.7592) grad_norm 2.2587 (3.2079) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 01:17:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][40/625] eta 0:05:55 lr 0.000117 wd 0.0500 time 0.5920 (0.6081) data time 0.0010 (0.0125) model time 0.0000 (0.0000) loss 7.6086 (6.7498) grad_norm 1.8326 (3.0563) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 01:17:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][50/625] eta 0:05:47 lr 0.000117 wd 0.0500 time 0.5890 (0.6040) data time 0.0011 (0.0103) model time 0.0000 (0.0000) loss 6.1205 (6.7520) grad_norm 2.4211 (2.8499) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 01:17:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][60/625] eta 0:05:40 lr 0.000117 wd 0.0500 time 0.5982 (0.6023) data time 0.0008 (0.0087) model time 0.5974 (0.5923) loss 6.7514 (6.7444) grad_norm 2.8548 (2.7506) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 01:17:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][70/625] eta 0:05:33 lr 0.000117 wd 0.0500 time 0.5973 (0.6006) data time 0.0008 (0.0077) model time 0.5965 (0.5908) loss 6.1784 (6.7254) grad_norm 2.6561 (2.7047) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 01:17:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-28 01:17:30 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 01:17:32 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 01:20:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 01:20:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 01:20:33 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-28 01:20:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-28 01:20:47 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-28 01:20:47 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-28 01:20:47 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-28 01:20:47 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 246) +[2024-07-28 01:20:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-28 01:21:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][80/625] eta 1:35:51 lr 0.000117 wd 0.0500 time 10.5533 (10.5533) data time 0.7270 (0.7270) model time 9.8263 (9.8263) loss 7.9948 (7.9948) grad_norm 1.9934 (1.9934) loss_scale 256.0000 (256.0000) mem 26016MB +[2024-07-28 01:21:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][90/625] eta 0:15:40 lr 0.000117 wd 0.0500 time 0.5820 (1.7584) data time 0.0012 (0.0672) model time 0.5808 (1.6912) loss 6.4582 (7.5535) grad_norm 2.8307 (2.5800) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:21:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][100/625] eta 0:10:32 lr 0.000117 wd 0.0500 time 0.5854 (1.2043) data time 0.0011 (0.0357) model time 0.5843 (1.1686) loss 6.5573 (7.4746) grad_norm 2.2402 (2.4861) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:21:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][110/625] eta 0:08:37 lr 0.000117 wd 0.0500 time 0.5839 (1.0050) data time 0.0009 (0.0246) model time 0.5830 (0.9804) loss 6.2920 (7.5349) grad_norm 1.9265 (2.5417) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:21:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][120/625] eta 0:07:37 lr 0.000117 wd 0.0500 time 0.5821 (0.9054) data time 0.0012 (0.0189) model time 0.5809 (0.8865) loss 7.1553 (7.4305) grad_norm 1.6500 (2.5624) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:21:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][130/625] eta 0:07:00 lr 0.000117 wd 0.0500 time 0.8125 (0.8489) data time 0.0009 (0.0155) model time 0.8116 (0.8335) loss 6.7626 (7.3685) grad_norm 2.3057 (2.5391) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:21:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][140/625] eta 0:06:32 lr 0.000117 wd 0.0500 time 0.5831 (0.8093) data time 0.0013 (0.0132) model time 0.5818 (0.7961) loss 7.5859 (7.2933) grad_norm 1.6501 (2.5933) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:21:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][150/625] eta 0:06:09 lr 0.000117 wd 0.0500 time 0.5912 (0.7788) data time 0.0011 (0.0115) model time 0.5901 (0.7673) loss 6.4086 (7.2417) grad_norm 2.6234 (2.5526) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:21:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][160/625] eta 0:05:51 lr 0.000117 wd 0.0500 time 0.5909 (0.7560) data time 0.0010 (0.0102) model time 0.5899 (0.7458) loss 6.6995 (7.2167) grad_norm 1.9152 (2.5668) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:21:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][170/625] eta 0:05:36 lr 0.000117 wd 0.0500 time 0.5868 (0.7407) data time 0.0009 (0.0092) model time 0.5859 (0.7314) loss 7.6073 (7.1666) grad_norm 2.0296 (2.5461) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:22:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][180/625] eta 0:05:23 lr 0.000117 wd 0.0500 time 0.5833 (0.7267) data time 0.0008 (0.0084) model time 0.5825 (0.7183) loss 6.9318 (7.1797) grad_norm 2.2747 (2.6148) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:22:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-28 01:22:09 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 01:22:18 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 01:45:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 01:45:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 01:45:51 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-28 01:46:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-28 01:46:05 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-28 01:46:05 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-28 01:46:05 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-28 01:46:06 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 246) +[2024-07-28 01:46:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-28 01:46:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][190/625] eta 0:25:36 lr 0.000117 wd 0.0500 time 0.6079 (3.5324) data time 0.0008 (0.1550) model time 0.6071 (3.3773) loss 7.9654 (7.6445) grad_norm 3.1806 (2.8485) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:46:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][200/625] eta 0:10:13 lr 0.000117 wd 0.0500 time 0.6121 (1.4429) data time 0.0008 (0.0451) model time 0.6113 (1.3978) loss 6.6513 (7.3298) grad_norm 2.6794 (3.3883) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:46:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][210/625] eta 0:07:34 lr 0.000116 wd 0.0500 time 0.6010 (1.0941) data time 0.0011 (0.0268) model time 0.5999 (1.0673) loss 7.1288 (7.3883) grad_norm 2.0502 (2.9296) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:46:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][220/625] eta 0:06:24 lr 0.000116 wd 0.0500 time 0.5986 (0.9495) data time 0.0008 (0.0192) model time 0.5978 (0.9303) loss 5.8123 (7.3283) grad_norm 2.6622 (2.8192) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:46:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][230/625] eta 0:05:43 lr 0.000116 wd 0.0500 time 0.6084 (0.8706) data time 0.0009 (0.0151) model time 0.6076 (0.8554) loss 6.1067 (7.2683) grad_norm 1.8329 (2.7589) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:46:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][240/625] eta 0:05:17 lr 0.000116 wd 0.0500 time 0.5996 (0.8242) data time 0.0008 (0.0125) model time 0.5989 (0.8116) loss 7.4687 (7.2362) grad_norm 2.5539 (2.7170) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:47:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][250/625] eta 0:04:57 lr 0.000116 wd 0.0500 time 0.6141 (0.7936) data time 0.0009 (0.0107) model time 0.6133 (0.7829) loss 7.6426 (7.1749) grad_norm 1.9180 (2.6897) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:47:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][260/625] eta 0:04:40 lr 0.000116 wd 0.0500 time 0.6159 (0.7691) data time 0.0008 (0.0094) model time 0.6151 (0.7596) loss 7.1007 (7.1433) grad_norm 3.5993 (2.6858) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:47:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][270/625] eta 0:04:26 lr 0.000116 wd 0.0500 time 0.6147 (0.7502) data time 0.0011 (0.0084) model time 0.6136 (0.7417) loss 7.4198 (7.1307) grad_norm 1.6074 (2.6844) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:47:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][280/625] eta 0:04:13 lr 0.000116 wd 0.0500 time 0.6162 (0.7353) data time 0.0010 (0.0076) model time 0.6152 (0.7277) loss 6.5774 (7.1167) grad_norm 3.5320 (2.6533) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:47:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][290/625] eta 0:04:02 lr 0.000116 wd 0.0500 time 0.6046 (0.7227) data time 0.0010 (0.0070) model time 0.6035 (0.7157) loss 7.1456 (7.1408) grad_norm 1.4727 (2.6074) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:47:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][300/625] eta 0:03:51 lr 0.000116 wd 0.0500 time 0.6095 (0.7123) data time 0.0011 (0.0065) model time 0.6084 (0.7058) loss 6.9125 (7.1273) grad_norm 2.5979 (2.5897) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:47:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][310/625] eta 0:03:41 lr 0.000116 wd 0.0500 time 0.6104 (0.7039) data time 0.0010 (0.0060) model time 0.6094 (0.6978) loss 6.2616 (7.1257) grad_norm 1.8682 (2.5739) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:47:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][320/625] eta 0:03:32 lr 0.000116 wd 0.0500 time 0.6117 (0.6967) data time 0.0010 (0.0057) model time 0.6107 (0.6909) loss 9.0313 (7.1262) grad_norm 1.8338 (2.5607) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:47:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-28 01:47:49 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 01:47:54 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 01:56:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 01:56:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 01:56:33 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-28 01:56:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-28 01:56:48 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-28 01:56:48 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-28 01:56:48 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-28 01:56:48 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 246) +[2024-07-28 01:56:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-28 01:57:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][330/625] eta 0:56:28 lr 0.000116 wd 0.0500 time 11.4868 (11.4868) data time 0.6364 (0.6364) model time 10.8503 (10.8503) loss 7.7576 (7.7576) grad_norm 2.3757 (2.3757) loss_scale 256.0000 (256.0000) mem 26016MB +[2024-07-28 01:57:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][340/625] eta 0:08:30 lr 0.000116 wd 0.0500 time 0.6059 (1.7917) data time 0.0011 (0.0590) model time 0.6048 (1.7327) loss 6.3999 (7.5507) grad_norm 2.5191 (2.9031) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:57:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][350/625] eta 0:05:38 lr 0.000116 wd 0.0500 time 0.6061 (1.2311) data time 0.0011 (0.0318) model time 0.6050 (1.1994) loss 6.5514 (7.3947) grad_norm 5.2172 (2.8724) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:57:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][360/625] eta 0:04:32 lr 0.000116 wd 0.0500 time 0.5944 (1.0296) data time 0.0008 (0.0219) model time 0.5936 (1.0077) loss 5.8666 (7.3637) grad_norm 2.3155 (3.0581) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:57:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][370/625] eta 0:03:56 lr 0.000115 wd 0.0500 time 0.5944 (0.9279) data time 0.0011 (0.0174) model time 0.5933 (0.9104) loss 7.1325 (7.2674) grad_norm 2.3109 (2.9650) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:57:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][380/625] eta 0:03:32 lr 0.000115 wd 0.0500 time 0.8083 (0.8687) data time 0.0008 (0.0143) model time 0.8075 (0.8544) loss 6.9389 (7.2553) grad_norm 3.6159 (2.8483) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:57:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][390/625] eta 0:03:14 lr 0.000115 wd 0.0500 time 0.6064 (0.8283) data time 0.0010 (0.0121) model time 0.6054 (0.8162) loss 6.8005 (7.1638) grad_norm 2.7631 (2.9273) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:57:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][400/625] eta 0:02:59 lr 0.000115 wd 0.0500 time 0.6061 (0.7978) data time 0.0011 (0.0106) model time 0.6050 (0.7872) loss 6.8070 (7.1301) grad_norm 2.2169 (2.8339) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:57:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][410/625] eta 0:02:46 lr 0.000115 wd 0.0500 time 0.6038 (0.7751) data time 0.0011 (0.0094) model time 0.6027 (0.7656) loss 6.1727 (7.1032) grad_norm 2.0421 (2.7874) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:58:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][420/625] eta 0:02:35 lr 0.000115 wd 0.0500 time 0.6123 (0.7573) data time 0.0009 (0.0086) model time 0.6114 (0.7487) loss 8.2745 (7.1028) grad_norm 1.5393 (2.7553) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:58:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][430/625] eta 0:02:24 lr 0.000115 wd 0.0500 time 0.6041 (0.7424) data time 0.0007 (0.0078) model time 0.6033 (0.7345) loss 7.0041 (7.1177) grad_norm 1.8227 (2.8178) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:58:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][440/625] eta 0:02:15 lr 0.000115 wd 0.0500 time 0.6023 (0.7301) data time 0.0010 (0.0072) model time 0.6013 (0.7229) loss 7.2706 (7.1271) grad_norm 2.7091 (2.7717) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:58:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][450/625] eta 0:02:05 lr 0.000115 wd 0.0500 time 0.5982 (0.7199) data time 0.0008 (0.0067) model time 0.5974 (0.7132) loss 6.2496 (7.1309) grad_norm 2.0124 (2.7240) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:58:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][460/625] eta 0:01:57 lr 0.000115 wd 0.0500 time 0.6012 (0.7111) data time 0.0010 (0.0063) model time 0.6002 (0.7048) loss 7.8449 (7.1094) grad_norm 1.6966 (2.6874) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:58:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][470/625] eta 0:01:49 lr 0.000115 wd 0.0500 time 0.6082 (0.7041) data time 0.0008 (0.0060) model time 0.6074 (0.6981) loss 7.2269 (7.0910) grad_norm 2.9819 (2.7333) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:58:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][480/625] eta 0:01:41 lr 0.000115 wd 0.0500 time 0.6063 (0.6982) data time 0.0011 (0.0057) model time 0.6052 (0.6925) loss 6.2811 (7.0678) grad_norm 2.7519 (2.7138) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:58:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][490/625] eta 0:01:33 lr 0.000115 wd 0.0500 time 0.6089 (0.6930) data time 0.0010 (0.0054) model time 0.6078 (0.6875) loss 8.1265 (7.0728) grad_norm 1.8121 (2.7021) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:58:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][500/625] eta 0:01:26 lr 0.000115 wd 0.0500 time 0.6024 (0.6881) data time 0.0010 (0.0052) model time 0.6013 (0.6830) loss 6.8445 (7.0799) grad_norm 6.3597 (2.7274) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:58:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][510/625] eta 0:01:18 lr 0.000115 wd 0.0500 time 0.5989 (0.6836) data time 0.0010 (0.0049) model time 0.5980 (0.6787) loss 7.6959 (7.0588) grad_norm 3.2728 (2.7118) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:59:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][520/625] eta 0:01:11 lr 0.000115 wd 0.0500 time 0.5916 (0.6795) data time 0.0011 (0.0047) model time 0.5905 (0.6747) loss 5.8766 (7.0528) grad_norm 1.8472 (2.6747) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:59:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][530/625] eta 0:01:04 lr 0.000115 wd 0.0500 time 0.5979 (0.6758) data time 0.0012 (0.0046) model time 0.5967 (0.6712) loss 7.5770 (7.0433) grad_norm 2.0416 (2.6518) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:59:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][540/625] eta 0:00:57 lr 0.000114 wd 0.0500 time 0.6059 (0.6727) data time 0.0010 (0.0044) model time 0.6049 (0.6683) loss 7.3356 (7.0361) grad_norm 2.7168 (2.6268) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:59:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][550/625] eta 0:00:50 lr 0.000114 wd 0.0500 time 0.6060 (0.6700) data time 0.0008 (0.0043) model time 0.6051 (0.6658) loss 7.0074 (7.0181) grad_norm 2.3799 (2.6119) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:59:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][560/625] eta 0:00:43 lr 0.000114 wd 0.0500 time 0.6072 (0.6677) data time 0.0008 (0.0041) model time 0.6064 (0.6636) loss 5.3505 (7.0144) grad_norm 2.1154 (2.6395) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:59:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][570/625] eta 0:00:36 lr 0.000114 wd 0.0500 time 0.6059 (0.6655) data time 0.0008 (0.0040) model time 0.6051 (0.6615) loss 7.2437 (7.0206) grad_norm 3.8724 (2.6356) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:59:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][580/625] eta 0:00:29 lr 0.000114 wd 0.0500 time 0.6032 (0.6631) data time 0.0008 (0.0039) model time 0.6025 (0.6592) loss 7.0793 (7.0111) grad_norm 2.0692 (2.6305) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:59:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][590/625] eta 0:00:23 lr 0.000114 wd 0.0500 time 0.5983 (0.6609) data time 0.0008 (0.0038) model time 0.5975 (0.6571) loss 6.5057 (7.0076) grad_norm 1.9803 (2.6484) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:59:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][600/625] eta 0:00:16 lr 0.000114 wd 0.0500 time 0.6034 (0.6589) data time 0.0008 (0.0037) model time 0.6026 (0.6552) loss 6.8789 (6.9979) grad_norm 2.6040 (2.6593) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 01:59:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][610/625] eta 0:00:09 lr 0.000114 wd 0.0500 time 0.6054 (0.6583) data time 0.0008 (0.0036) model time 0.6046 (0.6547) loss 7.3514 (7.0025) grad_norm 3.1369 (2.6921) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:00:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [246/300][620/625] eta 0:00:03 lr 0.000114 wd 0.0500 time 0.6075 (0.6568) data time 0.0005 (0.0035) model time 0.6070 (0.6533) loss 5.1546 (6.9962) grad_norm 1.7207 (2.6787) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:00:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 246 training takes 0:03:13 +[2024-07-28 02:00:06 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 02:00:14 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 02:00:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.513 (0.513) Loss 0.4893 (0.4893) Acc@1 90.186 (90.186) Acc@5 98.975 (98.975) Mem 22341MB +[2024-07-28 02:00:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.162) Loss 0.7495 (0.5983) Acc@1 82.617 (87.966) Acc@5 97.168 (98.158) Mem 22341MB +[2024-07-28 02:00:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.145) Loss 0.8286 (0.6862) Acc@1 80.811 (85.338) Acc@5 96.045 (97.326) Mem 22341MB +[2024-07-28 02:00:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.905 Acc@5 97.325 +[2024-07-28 02:00:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 84.9% +[2024-07-28 02:00:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.879 (0.879) Loss 0.5020 (0.5020) Acc@1 90.479 (90.479) Acc@5 99.023 (99.023) Mem 22341MB +[2024-07-28 02:00:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.195) Loss 0.7427 (0.6110) Acc@1 83.105 (88.033) Acc@5 96.924 (98.113) Mem 22341MB +[2024-07-28 02:00:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.162) Loss 0.8418 (0.6966) Acc@1 80.322 (85.240) Acc@5 95.996 (97.312) Mem 22341MB +[2024-07-28 02:00:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.853 Acc@5 97.311 +[2024-07-28 02:00:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.9% +[2024-07-28 02:00:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.85% +[2024-07-28 02:00:24 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-28 02:00:30 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-28 02:00:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][0/625] eta 0:11:55 lr 0.000114 wd 0.0500 time 1.1442 (1.1442) data time 0.4525 (0.4525) model time 0.0000 (0.0000) loss 6.6099 (6.6099) grad_norm 1.9317 (1.9317) loss_scale 256.0000 (256.0000) mem 22337MB +[2024-07-28 02:00:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][10/625] eta 0:06:43 lr 0.000114 wd 0.0500 time 0.6084 (0.6567) data time 0.0009 (0.0422) model time 0.0000 (0.0000) loss 6.0251 (6.8585) grad_norm 2.5321 (2.4081) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 02:00:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][20/625] eta 0:06:23 lr 0.000114 wd 0.0500 time 0.6113 (0.6338) data time 0.0010 (0.0226) model time 0.0000 (0.0000) loss 7.5680 (7.2189) grad_norm 3.7911 (3.6349) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 02:00:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][30/625] eta 0:06:11 lr 0.000114 wd 0.0500 time 0.6143 (0.6251) data time 0.0011 (0.0157) model time 0.0000 (0.0000) loss 8.0321 (7.1470) grad_norm 1.6823 (3.2462) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 02:00:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][40/625] eta 0:06:03 lr 0.000114 wd 0.0500 time 0.6061 (0.6219) data time 0.0011 (0.0121) model time 0.0000 (0.0000) loss 6.5586 (7.0528) grad_norm 2.8304 (2.9903) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 02:01:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][50/625] eta 0:05:56 lr 0.000114 wd 0.0500 time 0.6077 (0.6192) data time 0.0010 (0.0100) model time 0.0000 (0.0000) loss 8.7233 (7.1349) grad_norm 2.2476 (2.8678) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 02:01:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][60/625] eta 0:05:49 lr 0.000114 wd 0.0500 time 0.6185 (0.6178) data time 0.0008 (0.0085) model time 0.6177 (0.6095) loss 5.7558 (7.0715) grad_norm 2.9770 (2.8293) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 02:01:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][70/625] eta 0:05:42 lr 0.000114 wd 0.0500 time 0.6170 (0.6172) data time 0.0012 (0.0075) model time 0.6158 (0.6108) loss 8.8336 (7.1030) grad_norm 1.5992 (2.7483) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 02:01:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][80/625] eta 0:05:35 lr 0.000113 wd 0.0500 time 0.6160 (0.6164) data time 0.0009 (0.0067) model time 0.6151 (0.6104) loss 6.4262 (7.0847) grad_norm 3.0039 (2.7988) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 02:01:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][90/625] eta 0:05:29 lr 0.000113 wd 0.0500 time 0.6216 (0.6158) data time 0.0013 (0.0061) model time 0.6203 (0.6104) loss 7.7586 (7.0535) grad_norm 3.1281 (2.7844) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 02:01:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][100/625] eta 0:05:22 lr 0.000113 wd 0.0500 time 0.6122 (0.6149) data time 0.0008 (0.0056) model time 0.6114 (0.6095) loss 7.2126 (7.0400) grad_norm 2.3286 (2.7789) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 02:01:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][110/625] eta 0:05:16 lr 0.000113 wd 0.0500 time 0.6372 (0.6143) data time 0.0008 (0.0052) model time 0.6363 (0.6091) loss 7.0365 (7.0499) grad_norm 2.6454 (2.8980) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 02:01:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][120/625] eta 0:05:09 lr 0.000113 wd 0.0500 time 0.6080 (0.6138) data time 0.0008 (0.0049) model time 0.6072 (0.6086) loss 6.6842 (7.0367) grad_norm 2.3457 (2.8456) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 02:01:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-28 02:01:49 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 02:01:50 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 02:04:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 02:04:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 02:07:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 02:07:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 02:07:59 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-28 02:08:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-28 02:08:13 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-28 02:08:13 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-28 02:08:13 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-28 02:08:14 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 247) +[2024-07-28 02:08:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-28 02:08:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][130/625] eta 0:40:12 lr 0.000113 wd 0.0500 time 0.5493 (4.8743) data time 0.0010 (0.3168) model time 0.5483 (4.5576) loss 6.3624 (7.0194) grad_norm 1.6923 (1.9346) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:08:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][140/625] eta 0:12:22 lr 0.000113 wd 0.0500 time 0.5228 (1.5308) data time 0.0014 (0.0739) model time 0.5214 (1.4568) loss 8.1130 (7.1071) grad_norm 2.5581 (2.2154) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:08:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][150/625] eta 0:08:40 lr 0.000113 wd 0.0500 time 0.5169 (1.0967) data time 0.0008 (0.0423) model time 0.5162 (1.0544) loss 7.4850 (7.1310) grad_norm 2.4919 (2.2748) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:08:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][160/625] eta 0:07:12 lr 0.000113 wd 0.0500 time 0.5230 (0.9309) data time 0.0013 (0.0301) model time 0.5217 (0.9008) loss 8.1085 (7.2829) grad_norm 7.7673 (2.4128) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:08:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][170/625] eta 0:06:21 lr 0.000113 wd 0.0500 time 0.5229 (0.8384) data time 0.0011 (0.0233) model time 0.5218 (0.8151) loss 6.3499 (7.2250) grad_norm 2.3854 (2.4371) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:09:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][180/625] eta 0:05:49 lr 0.000113 wd 0.0500 time 0.5159 (0.7862) data time 0.0012 (0.0191) model time 0.5147 (0.7671) loss 6.8363 (7.1927) grad_norm 3.4349 (3.0645) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:09:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][190/625] eta 0:05:26 lr 0.000113 wd 0.0500 time 0.5512 (0.7503) data time 0.0013 (0.0167) model time 0.5499 (0.7337) loss 6.5521 (7.1588) grad_norm 2.3507 (3.1564) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:09:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][200/625] eta 0:05:06 lr 0.000113 wd 0.0500 time 0.5183 (0.7205) data time 0.0009 (0.0146) model time 0.5173 (0.7059) loss 7.9603 (7.1379) grad_norm 1.8082 (3.0756) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:09:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][210/625] eta 0:04:49 lr 0.000113 wd 0.0500 time 0.5176 (0.6982) data time 0.0007 (0.0130) model time 0.5169 (0.6852) loss 5.8887 (7.0879) grad_norm 3.2572 (3.0204) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:09:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][220/625] eta 0:04:35 lr 0.000113 wd 0.0500 time 0.5152 (0.6811) data time 0.0008 (0.0117) model time 0.5144 (0.6694) loss 8.1879 (7.0831) grad_norm 1.7683 (2.9294) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:09:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][230/625] eta 0:04:23 lr 0.000113 wd 0.0500 time 0.5169 (0.6666) data time 0.0007 (0.0108) model time 0.5161 (0.6558) loss 7.1332 (7.0996) grad_norm 2.0316 (2.8906) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:09:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][240/625] eta 0:04:12 lr 0.000113 wd 0.0500 time 0.5218 (0.6552) data time 0.0007 (0.0099) model time 0.5211 (0.6453) loss 6.0920 (7.0864) grad_norm 1.7846 (2.8324) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:09:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][250/625] eta 0:04:01 lr 0.000112 wd 0.0500 time 0.5229 (0.6445) data time 0.0010 (0.0093) model time 0.5219 (0.6352) loss 6.6805 (7.0938) grad_norm 1.9199 (2.8119) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:09:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][260/625] eta 0:03:52 lr 0.000112 wd 0.0500 time 0.5176 (0.6357) data time 0.0010 (0.0087) model time 0.5167 (0.6271) loss 7.3839 (7.0872) grad_norm 1.9541 (2.7725) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:09:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][270/625] eta 0:03:43 lr 0.000112 wd 0.0500 time 0.5374 (0.6291) data time 0.0014 (0.0082) model time 0.5360 (0.6209) loss 8.5015 (7.0691) grad_norm 2.4407 (2.7399) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:09:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][280/625] eta 0:03:34 lr 0.000112 wd 0.0500 time 0.5171 (0.6223) data time 0.0007 (0.0077) model time 0.5164 (0.6146) loss 6.8594 (7.0573) grad_norm 2.2110 (2.7123) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:09:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][290/625] eta 0:03:26 lr 0.000112 wd 0.0500 time 0.5196 (0.6164) data time 0.0010 (0.0073) model time 0.5187 (0.6090) loss 6.8383 (7.0548) grad_norm 2.5086 (2.7032) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:10:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][300/625] eta 0:03:18 lr 0.000112 wd 0.0500 time 0.5273 (0.6121) data time 0.0014 (0.0070) model time 0.5259 (0.6050) loss 8.3472 (7.0545) grad_norm 5.9113 (2.6875) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:10:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][310/625] eta 0:03:11 lr 0.000112 wd 0.0500 time 0.5165 (0.6077) data time 0.0010 (0.0067) model time 0.5155 (0.6010) loss 7.9694 (7.0424) grad_norm 1.7850 (2.6722) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:10:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][320/625] eta 0:03:04 lr 0.000112 wd 0.0500 time 0.5146 (0.6038) data time 0.0007 (0.0064) model time 0.5140 (0.5974) loss 7.5702 (7.0300) grad_norm 2.2872 (2.6558) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:10:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][330/625] eta 0:02:57 lr 0.000112 wd 0.0500 time 0.5185 (0.6003) data time 0.0007 (0.0062) model time 0.5178 (0.5941) loss 6.1105 (7.0099) grad_norm 3.3014 (2.6843) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:10:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][340/625] eta 0:02:50 lr 0.000112 wd 0.0500 time 0.5159 (0.5967) data time 0.0010 (0.0059) model time 0.5149 (0.5908) loss 6.2995 (7.0055) grad_norm 2.9041 (2.6677) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:10:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][350/625] eta 0:02:43 lr 0.000112 wd 0.0500 time 0.5173 (0.5941) data time 0.0008 (0.0057) model time 0.5165 (0.5884) loss 6.5755 (7.0091) grad_norm 2.0557 (2.6404) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:10:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][360/625] eta 0:02:36 lr 0.000112 wd 0.0500 time 0.5214 (0.5913) data time 0.0009 (0.0055) model time 0.5204 (0.5858) loss 6.1062 (7.0075) grad_norm 1.6037 (2.6168) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:10:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][370/625] eta 0:02:30 lr 0.000112 wd 0.0500 time 0.5161 (0.5891) data time 0.0011 (0.0053) model time 0.5149 (0.5837) loss 8.3913 (7.0047) grad_norm 1.6820 (2.5999) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:10:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][380/625] eta 0:02:23 lr 0.000112 wd 0.0500 time 0.5158 (0.5866) data time 0.0009 (0.0052) model time 0.5149 (0.5814) loss 7.9219 (7.0021) grad_norm 2.2569 (2.5893) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:10:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][390/625] eta 0:02:17 lr 0.000112 wd 0.0500 time 0.5172 (0.5845) data time 0.0008 (0.0050) model time 0.5164 (0.5795) loss 6.9486 (6.9852) grad_norm 1.8493 (2.5764) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:10:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][400/625] eta 0:02:11 lr 0.000112 wd 0.0500 time 0.5350 (0.5831) data time 0.0010 (0.0049) model time 0.5340 (0.5782) loss 7.8090 (6.9742) grad_norm 1.7154 (2.5725) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:11:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][410/625] eta 0:02:05 lr 0.000112 wd 0.0500 time 0.5497 (0.5821) data time 0.0010 (0.0048) model time 0.5488 (0.5773) loss 7.4805 (6.9819) grad_norm 2.8253 (2.6159) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:11:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][420/625] eta 0:01:58 lr 0.000111 wd 0.0500 time 0.5194 (0.5802) data time 0.0016 (0.0047) model time 0.5178 (0.5755) loss 7.3510 (6.9808) grad_norm 2.1131 (2.6165) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:11:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][430/625] eta 0:01:52 lr 0.000111 wd 0.0500 time 0.5140 (0.5783) data time 0.0008 (0.0045) model time 0.5131 (0.5738) loss 6.8401 (6.9675) grad_norm 1.6241 (2.6071) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:11:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][440/625] eta 0:01:46 lr 0.000111 wd 0.0500 time 0.5194 (0.5768) data time 0.0012 (0.0044) model time 0.5182 (0.5723) loss 8.7744 (6.9785) grad_norm 2.4570 (2.5971) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:11:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][450/625] eta 0:01:40 lr 0.000111 wd 0.0500 time 0.5151 (0.5753) data time 0.0009 (0.0043) model time 0.5142 (0.5710) loss 8.1528 (7.0071) grad_norm 2.0441 (2.5893) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:11:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][460/625] eta 0:01:34 lr 0.000111 wd 0.0500 time 0.5383 (0.5742) data time 0.0016 (0.0042) model time 0.5367 (0.5700) loss 7.6424 (7.0092) grad_norm 1.9279 (2.5783) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:11:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][470/625] eta 0:01:28 lr 0.000111 wd 0.0500 time 0.5175 (0.5727) data time 0.0009 (0.0041) model time 0.5166 (0.5685) loss 6.4478 (7.0132) grad_norm 3.1612 (2.5858) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:11:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][480/625] eta 0:01:22 lr 0.000111 wd 0.0500 time 0.5157 (0.5715) data time 0.0007 (0.0041) model time 0.5150 (0.5674) loss 7.5114 (7.0137) grad_norm 2.0339 (2.5823) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:11:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][490/625] eta 0:01:16 lr 0.000111 wd 0.0500 time 0.5158 (0.5703) data time 0.0007 (0.0040) model time 0.5151 (0.5663) loss 7.3235 (7.0131) grad_norm 2.5100 (2.6371) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:11:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][500/625] eta 0:01:11 lr 0.000111 wd 0.0500 time 0.5182 (0.5693) data time 0.0007 (0.0039) model time 0.5175 (0.5654) loss 7.1095 (7.0173) grad_norm 3.3703 (2.6381) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:11:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][510/625] eta 0:01:05 lr 0.000111 wd 0.0500 time 0.5162 (0.5683) data time 0.0010 (0.0038) model time 0.5152 (0.5644) loss 6.5707 (7.0163) grad_norm 2.3357 (2.6252) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:12:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][520/625] eta 0:00:59 lr 0.000111 wd 0.0500 time 0.5152 (0.5673) data time 0.0011 (0.0038) model time 0.5142 (0.5636) loss 7.8828 (7.0089) grad_norm 3.6901 (2.6472) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:12:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][530/625] eta 0:00:53 lr 0.000111 wd 0.0500 time 0.5155 (0.5664) data time 0.0010 (0.0037) model time 0.5145 (0.5627) loss 6.9687 (7.0094) grad_norm 20.0124 (2.6829) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:12:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][540/625] eta 0:00:48 lr 0.000111 wd 0.0500 time 0.5179 (0.5656) data time 0.0010 (0.0037) model time 0.5169 (0.5620) loss 7.7695 (7.0127) grad_norm 2.5212 (2.6721) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:12:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][550/625] eta 0:00:42 lr 0.000111 wd 0.0500 time 0.5386 (0.5648) data time 0.0008 (0.0036) model time 0.5378 (0.5612) loss 6.9478 (7.0073) grad_norm 4.8503 (2.6725) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:12:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][560/625] eta 0:00:36 lr 0.000111 wd 0.0500 time 0.5177 (0.5643) data time 0.0008 (0.0035) model time 0.5169 (0.5607) loss 7.5573 (7.0227) grad_norm 4.5360 (2.6810) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:12:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][570/625] eta 0:00:30 lr 0.000111 wd 0.0500 time 0.5394 (0.5633) data time 0.0007 (0.0035) model time 0.5387 (0.5599) loss 5.9162 (7.0230) grad_norm 2.3268 (2.6829) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:12:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][580/625] eta 0:00:25 lr 0.000111 wd 0.0500 time 0.5179 (0.5625) data time 0.0008 (0.0034) model time 0.5171 (0.5590) loss 6.2213 (7.0167) grad_norm 2.6293 (2.6788) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:12:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][590/625] eta 0:00:19 lr 0.000110 wd 0.0500 time 0.5456 (0.5619) data time 0.0012 (0.0034) model time 0.5444 (0.5585) loss 6.8198 (7.0098) grad_norm 2.0183 (2.6833) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:12:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][600/625] eta 0:00:14 lr 0.000110 wd 0.0500 time 0.5152 (0.5611) data time 0.0008 (0.0033) model time 0.5145 (0.5578) loss 7.6835 (7.0071) grad_norm 1.6753 (2.6745) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:12:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][610/625] eta 0:00:08 lr 0.000110 wd 0.0500 time 0.5481 (0.5604) data time 0.0006 (0.0033) model time 0.5475 (0.5571) loss 7.8395 (7.0044) grad_norm 2.6330 (2.6723) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:12:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [247/300][620/625] eta 0:00:02 lr 0.000110 wd 0.0500 time 0.5237 (0.5600) data time 0.0007 (0.0033) model time 0.5230 (0.5567) loss 6.3735 (7.0083) grad_norm 2.8533 (2.6694) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 02:12:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 247 training takes 0:04:38 +[2024-07-28 02:12:57 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 02:13:02 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 02:13:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.571 (0.571) Loss 0.4966 (0.4966) Acc@1 90.479 (90.479) Acc@5 99.023 (99.023) Mem 22341MB +[2024-07-28 02:13:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.159) Loss 0.7383 (0.6040) Acc@1 83.350 (88.086) Acc@5 97.119 (98.091) Mem 22341MB +[2024-07-28 02:13:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.139) Loss 0.8379 (0.6927) Acc@1 80.469 (85.396) Acc@5 95.996 (97.261) Mem 22341MB +[2024-07-28 02:13:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.035 Acc@5 97.271 +[2024-07-28 02:13:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.0% +[2024-07-28 02:13:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 85.03% +[2024-07-28 02:13:09 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-28 02:13:14 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-28 02:13:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.564 (0.564) Loss 0.5020 (0.5020) Acc@1 90.479 (90.479) Acc@5 99.023 (99.023) Mem 22341MB +[2024-07-28 02:13:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.158) Loss 0.7427 (0.6108) Acc@1 83.105 (88.064) Acc@5 96.875 (98.109) Mem 22341MB +[2024-07-28 02:13:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.138) Loss 0.8403 (0.6963) Acc@1 80.322 (85.249) Acc@5 96.094 (97.321) Mem 22341MB +[2024-07-28 02:13:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.855 Acc@5 97.313 +[2024-07-28 02:13:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.9% +[2024-07-28 02:13:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.85% +[2024-07-28 02:13:18 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-28 02:13:21 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-28 02:13:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][0/625] eta 0:14:41 lr 0.000110 wd 0.0500 time 1.4109 (1.4109) data time 0.6297 (0.6297) model time 0.0000 (0.0000) loss 7.7660 (7.7660) grad_norm 1.7043 (1.7043) loss_scale 256.0000 (256.0000) mem 22337MB +[2024-07-28 02:13:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][10/625] eta 0:06:12 lr 0.000110 wd 0.0500 time 0.5179 (0.6058) data time 0.0006 (0.0583) model time 0.0000 (0.0000) loss 7.3105 (7.1297) grad_norm 2.8138 (2.4749) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 02:13:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][20/625] eta 0:05:43 lr 0.000110 wd 0.0500 time 0.5301 (0.5678) data time 0.0010 (0.0311) model time 0.0000 (0.0000) loss 7.5250 (7.1012) grad_norm 9.2621 (2.8401) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 02:13:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][30/625] eta 0:05:29 lr 0.000110 wd 0.0500 time 0.5224 (0.5535) data time 0.0007 (0.0214) model time 0.0000 (0.0000) loss 5.4784 (6.8799) grad_norm 2.0473 (3.2702) loss_scale 512.0000 (322.0645) mem 22339MB +[2024-07-28 02:13:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][40/625] eta 0:05:19 lr 0.000110 wd 0.0500 time 0.5239 (0.5466) data time 0.0007 (0.0165) model time 0.0000 (0.0000) loss 6.8576 (6.8332) grad_norm 2.6367 (3.1041) loss_scale 512.0000 (368.3902) mem 22339MB +[2024-07-28 02:13:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][50/625] eta 0:05:12 lr 0.000110 wd 0.0500 time 0.5529 (0.5431) data time 0.0007 (0.0135) model time 0.0000 (0.0000) loss 6.8601 (6.8812) grad_norm 2.2751 (3.1295) loss_scale 512.0000 (396.5490) mem 22339MB +[2024-07-28 02:13:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][60/625] eta 0:05:05 lr 0.000110 wd 0.0500 time 0.5528 (0.5402) data time 0.0010 (0.0115) model time 0.5518 (0.5240) loss 7.3530 (6.9055) grad_norm 2.0340 (3.1565) loss_scale 512.0000 (415.4754) mem 22339MB +[2024-07-28 02:13:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][70/625] eta 0:04:58 lr 0.000110 wd 0.0500 time 0.5593 (0.5385) data time 0.0007 (0.0101) model time 0.5586 (0.5255) loss 7.7029 (6.9500) grad_norm 3.4462 (3.1023) loss_scale 512.0000 (429.0704) mem 22339MB +[2024-07-28 02:14:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][80/625] eta 0:04:52 lr 0.000110 wd 0.0500 time 0.5242 (0.5361) data time 0.0009 (0.0090) model time 0.5233 (0.5230) loss 7.3979 (6.9426) grad_norm 2.0713 (3.0151) loss_scale 512.0000 (439.3086) mem 22339MB +[2024-07-28 02:14:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][90/625] eta 0:04:46 lr 0.000110 wd 0.0500 time 0.5318 (0.5359) data time 0.0010 (0.0081) model time 0.5308 (0.5254) loss 7.1603 (6.9936) grad_norm 2.2933 (3.0032) loss_scale 512.0000 (447.2967) mem 22339MB +[2024-07-28 02:14:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][100/625] eta 0:04:40 lr 0.000110 wd 0.0500 time 0.5288 (0.5349) data time 0.0010 (0.0074) model time 0.5278 (0.5253) loss 5.9982 (6.9879) grad_norm 2.4019 (2.9528) loss_scale 512.0000 (453.7030) mem 22339MB +[2024-07-28 02:14:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][110/625] eta 0:04:35 lr 0.000110 wd 0.0500 time 0.5152 (0.5344) data time 0.0010 (0.0068) model time 0.5142 (0.5257) loss 6.6914 (6.9834) grad_norm 2.1817 (2.9193) loss_scale 512.0000 (458.9550) mem 22339MB +[2024-07-28 02:14:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][120/625] eta 0:04:29 lr 0.000110 wd 0.0500 time 0.5200 (0.5345) data time 0.0012 (0.0064) model time 0.5188 (0.5270) loss 6.1412 (6.9928) grad_norm 1.6587 (2.9080) loss_scale 512.0000 (463.3388) mem 22339MB +[2024-07-28 02:14:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][130/625] eta 0:04:24 lr 0.000110 wd 0.0500 time 0.5190 (0.5340) data time 0.0007 (0.0060) model time 0.5182 (0.5270) loss 5.6688 (6.9638) grad_norm 2.5581 (2.8589) loss_scale 512.0000 (467.0534) mem 22339MB +[2024-07-28 02:14:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][140/625] eta 0:04:18 lr 0.000109 wd 0.0500 time 0.5205 (0.5338) data time 0.0012 (0.0056) model time 0.5194 (0.5274) loss 6.7774 (6.9677) grad_norm 2.6931 (2.8233) loss_scale 512.0000 (470.2411) mem 22339MB +[2024-07-28 02:14:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][150/625] eta 0:04:13 lr 0.000109 wd 0.0500 time 0.5178 (0.5339) data time 0.0012 (0.0053) model time 0.5166 (0.5280) loss 6.4891 (6.9439) grad_norm 1.9064 (2.7834) loss_scale 512.0000 (473.0066) mem 22339MB +[2024-07-28 02:14:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][160/625] eta 0:04:08 lr 0.000109 wd 0.0500 time 0.5230 (0.5337) data time 0.0007 (0.0051) model time 0.5223 (0.5281) loss 6.1267 (6.9133) grad_norm 2.0090 (2.7579) loss_scale 512.0000 (475.4286) mem 22339MB +[2024-07-28 02:14:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][170/625] eta 0:04:02 lr 0.000109 wd 0.0500 time 0.5734 (0.5335) data time 0.0007 (0.0048) model time 0.5727 (0.5283) loss 6.3245 (6.9118) grad_norm 3.9265 (2.7779) loss_scale 512.0000 (477.5673) mem 22339MB +[2024-07-28 02:14:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][180/625] eta 0:03:57 lr 0.000109 wd 0.0500 time 0.5219 (0.5334) data time 0.0011 (0.0046) model time 0.5209 (0.5285) loss 6.9750 (6.9087) grad_norm 2.8140 (2.7735) loss_scale 512.0000 (479.4696) mem 22339MB +[2024-07-28 02:15:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][190/625] eta 0:03:51 lr 0.000109 wd 0.0500 time 0.5209 (0.5332) data time 0.0009 (0.0045) model time 0.5200 (0.5284) loss 6.6450 (6.9162) grad_norm 2.2000 (2.7675) loss_scale 512.0000 (481.1728) mem 22339MB +[2024-07-28 02:15:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][200/625] eta 0:03:46 lr 0.000109 wd 0.0500 time 0.5345 (0.5329) data time 0.0007 (0.0043) model time 0.5338 (0.5282) loss 6.7641 (6.9139) grad_norm 2.4912 (2.7587) loss_scale 512.0000 (482.7065) mem 22339MB +[2024-07-28 02:15:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][210/625] eta 0:03:41 lr 0.000109 wd 0.0500 time 0.5203 (0.5329) data time 0.0009 (0.0041) model time 0.5194 (0.5285) loss 7.8881 (6.9217) grad_norm 2.7467 (2.7468) loss_scale 512.0000 (484.0948) mem 22339MB +[2024-07-28 02:15:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][220/625] eta 0:03:36 lr 0.000109 wd 0.0500 time 0.5328 (0.5337) data time 0.0010 (0.0040) model time 0.5318 (0.5298) loss 6.9438 (6.9134) grad_norm 2.7901 (2.7303) loss_scale 512.0000 (485.3575) mem 22339MB +[2024-07-28 02:15:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][230/625] eta 0:03:30 lr 0.000109 wd 0.0500 time 0.5728 (0.5335) data time 0.0009 (0.0039) model time 0.5719 (0.5296) loss 7.6335 (6.8976) grad_norm 2.4211 (2.7092) loss_scale 512.0000 (486.5108) mem 22339MB +[2024-07-28 02:15:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][240/625] eta 0:03:25 lr 0.000109 wd 0.0500 time 0.5247 (0.5340) data time 0.0011 (0.0038) model time 0.5236 (0.5304) loss 8.3122 (6.9263) grad_norm 3.0245 (2.7152) loss_scale 512.0000 (487.5685) mem 22339MB +[2024-07-28 02:15:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][250/625] eta 0:03:20 lr 0.000109 wd 0.0500 time 0.5330 (0.5340) data time 0.0007 (0.0037) model time 0.5323 (0.5305) loss 5.7778 (6.9325) grad_norm 2.0683 (2.7034) loss_scale 512.0000 (488.5418) mem 22339MB +[2024-07-28 02:15:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][260/625] eta 0:03:14 lr 0.000109 wd 0.0500 time 0.5263 (0.5341) data time 0.0006 (0.0036) model time 0.5256 (0.5307) loss 8.0940 (6.9377) grad_norm 2.0722 (2.6835) loss_scale 512.0000 (489.4406) mem 22339MB +[2024-07-28 02:15:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][270/625] eta 0:03:09 lr 0.000109 wd 0.0500 time 0.5692 (0.5344) data time 0.0009 (0.0035) model time 0.5683 (0.5311) loss 7.3500 (6.9468) grad_norm 2.8398 (2.6690) loss_scale 512.0000 (490.2731) mem 22339MB +[2024-07-28 02:15:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][280/625] eta 0:03:04 lr 0.000109 wd 0.0500 time 0.5930 (0.5345) data time 0.0015 (0.0034) model time 0.5915 (0.5314) loss 7.5841 (6.9609) grad_norm 2.4360 (2.6605) loss_scale 512.0000 (491.0463) mem 22339MB +[2024-07-28 02:15:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][290/625] eta 0:02:59 lr 0.000109 wd 0.0500 time 0.5396 (0.5346) data time 0.0008 (0.0034) model time 0.5388 (0.5315) loss 7.4198 (6.9608) grad_norm 2.2001 (2.6616) loss_scale 512.0000 (491.7663) mem 22339MB +[2024-07-28 02:16:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][300/625] eta 0:02:53 lr 0.000109 wd 0.0500 time 0.5577 (0.5346) data time 0.0008 (0.0033) model time 0.5569 (0.5316) loss 5.9937 (6.9650) grad_norm 2.2502 (2.6640) loss_scale 512.0000 (492.4385) mem 22339MB +[2024-07-28 02:16:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][310/625] eta 0:02:48 lr 0.000108 wd 0.0500 time 0.5437 (0.5344) data time 0.0012 (0.0033) model time 0.5425 (0.5315) loss 7.7910 (6.9603) grad_norm 5.5723 (2.6658) loss_scale 512.0000 (493.0675) mem 22339MB +[2024-07-28 02:16:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][320/625] eta 0:02:43 lr 0.000108 wd 0.0500 time 0.5243 (0.5346) data time 0.0009 (0.0032) model time 0.5234 (0.5318) loss 8.1950 (6.9588) grad_norm 2.3999 (2.6554) loss_scale 512.0000 (493.6573) mem 22339MB +[2024-07-28 02:16:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][330/625] eta 0:02:37 lr 0.000108 wd 0.0500 time 0.5988 (0.5344) data time 0.0010 (0.0031) model time 0.5978 (0.5316) loss 7.4596 (6.9519) grad_norm 1.8209 (2.6472) loss_scale 512.0000 (494.2115) mem 22339MB +[2024-07-28 02:16:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][340/625] eta 0:02:32 lr 0.000108 wd 0.0500 time 0.5608 (0.5350) data time 0.0008 (0.0031) model time 0.5601 (0.5324) loss 7.1623 (6.9481) grad_norm 4.1195 (2.6775) loss_scale 512.0000 (494.7331) mem 22339MB +[2024-07-28 02:16:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][350/625] eta 0:02:27 lr 0.000108 wd 0.0500 time 0.5630 (0.5351) data time 0.0011 (0.0030) model time 0.5618 (0.5326) loss 5.5274 (6.9360) grad_norm 3.2615 (2.6733) loss_scale 512.0000 (495.2251) mem 22339MB +[2024-07-28 02:16:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][360/625] eta 0:02:21 lr 0.000108 wd 0.0500 time 0.5398 (0.5349) data time 0.0007 (0.0030) model time 0.5391 (0.5323) loss 6.3584 (6.9382) grad_norm 2.4506 (2.6768) loss_scale 512.0000 (495.6898) mem 22339MB +[2024-07-28 02:16:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][370/625] eta 0:02:16 lr 0.000108 wd 0.0500 time 0.5165 (0.5350) data time 0.0009 (0.0030) model time 0.5156 (0.5323) loss 7.1675 (6.9396) grad_norm 2.2302 (2.6645) loss_scale 512.0000 (496.1294) mem 22339MB +[2024-07-28 02:16:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][380/625] eta 0:02:11 lr 0.000108 wd 0.0500 time 0.5262 (0.5350) data time 0.0012 (0.0030) model time 0.5250 (0.5324) loss 7.6058 (6.9364) grad_norm 2.1090 (2.6535) loss_scale 512.0000 (496.5459) mem 22339MB +[2024-07-28 02:16:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][390/625] eta 0:02:05 lr 0.000108 wd 0.0500 time 0.5543 (0.5349) data time 0.0013 (0.0029) model time 0.5530 (0.5323) loss 8.5880 (6.9377) grad_norm 2.0576 (2.6501) loss_scale 512.0000 (496.9412) mem 22339MB +[2024-07-28 02:16:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][400/625] eta 0:02:00 lr 0.000108 wd 0.0500 time 0.5439 (0.5348) data time 0.0008 (0.0029) model time 0.5431 (0.5322) loss 6.4267 (6.9359) grad_norm 2.9937 (2.6507) loss_scale 512.0000 (497.3167) mem 22339MB +[2024-07-28 02:17:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][410/625] eta 0:01:54 lr 0.000108 wd 0.0500 time 0.5220 (0.5347) data time 0.0010 (0.0028) model time 0.5211 (0.5322) loss 6.4854 (6.9351) grad_norm 4.0453 (2.6654) loss_scale 512.0000 (497.6740) mem 22339MB +[2024-07-28 02:17:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][420/625] eta 0:01:49 lr 0.000108 wd 0.0500 time 0.5481 (0.5347) data time 0.0009 (0.0028) model time 0.5472 (0.5322) loss 6.1706 (6.9351) grad_norm 1.8630 (2.6659) loss_scale 512.0000 (498.0143) mem 22339MB +[2024-07-28 02:17:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][430/625] eta 0:01:44 lr 0.000108 wd 0.0500 time 0.5170 (0.5347) data time 0.0007 (0.0028) model time 0.5163 (0.5322) loss 6.6992 (6.9424) grad_norm 3.4213 (2.6699) loss_scale 512.0000 (498.3387) mem 22339MB +[2024-07-28 02:17:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][440/625] eta 0:01:38 lr 0.000108 wd 0.0500 time 0.5467 (0.5348) data time 0.0007 (0.0027) model time 0.5460 (0.5324) loss 6.3121 (6.9429) grad_norm 2.4547 (2.6725) loss_scale 512.0000 (498.6485) mem 22339MB +[2024-07-28 02:17:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][450/625] eta 0:01:33 lr 0.000108 wd 0.0500 time 0.5448 (0.5347) data time 0.0008 (0.0027) model time 0.5439 (0.5322) loss 7.0366 (6.9360) grad_norm 3.2827 (2.6776) loss_scale 512.0000 (498.9446) mem 22339MB +[2024-07-28 02:17:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][460/625] eta 0:01:28 lr 0.000108 wd 0.0500 time 0.5193 (0.5344) data time 0.0008 (0.0027) model time 0.5185 (0.5320) loss 7.7674 (6.9338) grad_norm 2.6595 (2.6867) loss_scale 512.0000 (499.2278) mem 22339MB +[2024-07-28 02:17:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][470/625] eta 0:01:22 lr 0.000108 wd 0.0500 time 0.5173 (0.5343) data time 0.0012 (0.0027) model time 0.5161 (0.5319) loss 6.9677 (6.9382) grad_norm 2.1067 (2.6889) loss_scale 512.0000 (499.4989) mem 22339MB +[2024-07-28 02:17:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][480/625] eta 0:01:17 lr 0.000107 wd 0.0500 time 0.5197 (0.5342) data time 0.0009 (0.0026) model time 0.5188 (0.5318) loss 5.5864 (6.9426) grad_norm 3.4088 (2.7085) loss_scale 512.0000 (499.7588) mem 22339MB +[2024-07-28 02:17:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][490/625] eta 0:01:12 lr 0.000107 wd 0.0500 time 0.5622 (0.5341) data time 0.0011 (0.0026) model time 0.5612 (0.5317) loss 6.0741 (6.9437) grad_norm 1.7699 (2.7035) loss_scale 512.0000 (500.0081) mem 22339MB +[2024-07-28 02:17:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][500/625] eta 0:01:06 lr 0.000107 wd 0.0500 time 0.5688 (0.5339) data time 0.0009 (0.0026) model time 0.5679 (0.5315) loss 7.5383 (6.9445) grad_norm 7.5775 (2.7052) loss_scale 512.0000 (500.2475) mem 22339MB +[2024-07-28 02:17:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][510/625] eta 0:01:01 lr 0.000107 wd 0.0500 time 0.5200 (0.5337) data time 0.0007 (0.0026) model time 0.5193 (0.5314) loss 7.8199 (6.9481) grad_norm 2.9098 (2.6992) loss_scale 512.0000 (500.4775) mem 22339MB +[2024-07-28 02:17:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][520/625] eta 0:00:56 lr 0.000107 wd 0.0500 time 0.5304 (0.5336) data time 0.0014 (0.0025) model time 0.5291 (0.5313) loss 6.7435 (6.9532) grad_norm 2.3311 (2.7158) loss_scale 512.0000 (500.6987) mem 22339MB +[2024-07-28 02:18:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][530/625] eta 0:00:50 lr 0.000107 wd 0.0500 time 0.5214 (0.5335) data time 0.0013 (0.0025) model time 0.5201 (0.5311) loss 6.5834 (6.9530) grad_norm 2.3804 (2.7286) loss_scale 512.0000 (500.9115) mem 22339MB +[2024-07-28 02:18:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][540/625] eta 0:00:45 lr 0.000107 wd 0.0500 time 0.5658 (0.5334) data time 0.0006 (0.0025) model time 0.5652 (0.5310) loss 8.2060 (6.9563) grad_norm 6.4988 (2.7433) loss_scale 512.0000 (501.1165) mem 22339MB +[2024-07-28 02:18:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][550/625] eta 0:00:39 lr 0.000107 wd 0.0500 time 0.5201 (0.5332) data time 0.0010 (0.0024) model time 0.5190 (0.5308) loss 6.0400 (6.9538) grad_norm 2.4650 (2.7436) loss_scale 512.0000 (501.3140) mem 22339MB +[2024-07-28 02:18:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][560/625] eta 0:00:34 lr 0.000107 wd 0.0500 time 0.5200 (0.5334) data time 0.0010 (0.0024) model time 0.5190 (0.5311) loss 8.4035 (6.9577) grad_norm 3.0317 (2.7516) loss_scale 512.0000 (501.5045) mem 22339MB +[2024-07-28 02:18:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][570/625] eta 0:00:29 lr 0.000107 wd 0.0500 time 0.5174 (0.5332) data time 0.0009 (0.0024) model time 0.5166 (0.5309) loss 7.7011 (6.9565) grad_norm 2.5839 (2.7640) loss_scale 512.0000 (501.6883) mem 22339MB +[2024-07-28 02:18:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][580/625] eta 0:00:23 lr 0.000107 wd 0.0500 time 0.5296 (0.5332) data time 0.0009 (0.0024) model time 0.5287 (0.5310) loss 7.5156 (6.9538) grad_norm 2.7342 (2.7673) loss_scale 512.0000 (501.8657) mem 22339MB +[2024-07-28 02:18:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][590/625] eta 0:00:18 lr 0.000107 wd 0.0500 time 0.5174 (0.5330) data time 0.0007 (0.0023) model time 0.5166 (0.5307) loss 5.8038 (6.9497) grad_norm 2.8347 (2.7604) loss_scale 512.0000 (502.0372) mem 22339MB +[2024-07-28 02:18:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][600/625] eta 0:00:13 lr 0.000107 wd 0.0500 time 0.5211 (0.5329) data time 0.0007 (0.0023) model time 0.5204 (0.5307) loss 6.1534 (6.9507) grad_norm 2.5827 (2.7583) loss_scale 512.0000 (502.2030) mem 22339MB +[2024-07-28 02:18:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][610/625] eta 0:00:07 lr 0.000107 wd 0.0500 time 0.5147 (0.5329) data time 0.0005 (0.0023) model time 0.5142 (0.5307) loss 6.2881 (6.9537) grad_norm 2.0740 (2.7466) loss_scale 512.0000 (502.3633) mem 22339MB +[2024-07-28 02:18:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [248/300][620/625] eta 0:00:02 lr 0.000107 wd 0.0500 time 0.5143 (0.5327) data time 0.0005 (0.0023) model time 0.5138 (0.5305) loss 7.7652 (6.9582) grad_norm 1.9403 (2.7447) loss_scale 512.0000 (502.5185) mem 22339MB +[2024-07-28 02:18:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 248 training takes 0:05:32 +[2024-07-28 02:18:54 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 02:18:55 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 02:18:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.590 (0.590) Loss 0.4980 (0.4980) Acc@1 90.137 (90.137) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-28 02:18:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.160) Loss 0.7510 (0.6051) Acc@1 83.447 (88.033) Acc@5 97.217 (98.140) Mem 22339MB +[2024-07-28 02:18:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.139) Loss 0.8237 (0.6915) Acc@1 80.762 (85.410) Acc@5 96.533 (97.370) Mem 22339MB +[2024-07-28 02:18:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.013 Acc@5 97.361 +[2024-07-28 02:18:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.0% +[2024-07-28 02:18:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.977 (0.977) Loss 0.5015 (0.5015) Acc@1 90.479 (90.479) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-28 02:19:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.117 (0.197) Loss 0.7422 (0.6103) Acc@1 82.959 (88.086) Acc@5 96.875 (98.127) Mem 22339MB +[2024-07-28 02:19:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.158) Loss 0.8394 (0.6956) Acc@1 80.469 (85.268) Acc@5 96.094 (97.331) Mem 22339MB +[2024-07-28 02:19:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.865 Acc@5 97.323 +[2024-07-28 02:19:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.9% +[2024-07-28 02:19:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.86% +[2024-07-28 02:19:02 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-28 02:19:04 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-28 02:19:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][0/625] eta 0:11:20 lr 0.000107 wd 0.0500 time 1.0880 (1.0880) data time 0.5681 (0.5681) model time 0.0000 (0.0000) loss 7.1346 (7.1346) grad_norm 2.8374 (2.8374) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:19:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][10/625] eta 0:05:57 lr 0.000107 wd 0.0500 time 0.5181 (0.5811) data time 0.0010 (0.0527) model time 0.0000 (0.0000) loss 7.1467 (7.0169) grad_norm 2.4385 (2.3439) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:19:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][20/625] eta 0:05:33 lr 0.000107 wd 0.0500 time 0.5178 (0.5515) data time 0.0006 (0.0281) model time 0.0000 (0.0000) loss 6.9783 (7.1102) grad_norm 1.8352 (2.6674) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:19:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][30/625] eta 0:05:23 lr 0.000106 wd 0.0500 time 0.5185 (0.5437) data time 0.0007 (0.0194) model time 0.0000 (0.0000) loss 7.9753 (7.0607) grad_norm 2.9122 (2.6492) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:19:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][40/625] eta 0:05:17 lr 0.000106 wd 0.0500 time 0.5169 (0.5425) data time 0.0010 (0.0149) model time 0.0000 (0.0000) loss 5.9265 (6.9644) grad_norm 1.8835 (2.8251) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:19:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][50/625] eta 0:05:10 lr 0.000106 wd 0.0500 time 0.5183 (0.5392) data time 0.0007 (0.0122) model time 0.0000 (0.0000) loss 6.1537 (6.9183) grad_norm 2.3478 (2.8250) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:19:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][60/625] eta 0:05:03 lr 0.000106 wd 0.0500 time 0.5388 (0.5365) data time 0.0007 (0.0103) model time 0.5381 (0.5215) loss 6.5693 (7.0036) grad_norm 11.5018 (2.9127) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:19:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][70/625] eta 0:04:57 lr 0.000106 wd 0.0500 time 0.5195 (0.5359) data time 0.0010 (0.0091) model time 0.5185 (0.5262) loss 6.0275 (6.9802) grad_norm 1.6101 (2.8310) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:19:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][80/625] eta 0:04:51 lr 0.000106 wd 0.0500 time 0.5206 (0.5345) data time 0.0008 (0.0081) model time 0.5198 (0.5254) loss 8.0204 (6.9991) grad_norm 1.8438 (2.7446) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:19:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][90/625] eta 0:04:47 lr 0.000106 wd 0.0500 time 0.5194 (0.5366) data time 0.0007 (0.0073) model time 0.5187 (0.5321) loss 7.1841 (6.9882) grad_norm 1.8522 (2.7170) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:19:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][100/625] eta 0:04:41 lr 0.000106 wd 0.0500 time 0.5776 (0.5362) data time 0.0010 (0.0067) model time 0.5766 (0.5321) loss 7.4237 (6.9915) grad_norm 2.1243 (2.6787) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:20:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][110/625] eta 0:04:35 lr 0.000106 wd 0.0500 time 0.5937 (0.5355) data time 0.0013 (0.0062) model time 0.5924 (0.5312) loss 6.9134 (6.9788) grad_norm 2.5465 (2.6691) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:20:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][120/625] eta 0:04:29 lr 0.000106 wd 0.0500 time 0.5264 (0.5344) data time 0.0015 (0.0058) model time 0.5250 (0.5298) loss 7.6590 (6.9814) grad_norm 2.9057 (2.7089) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:20:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][130/625] eta 0:04:24 lr 0.000106 wd 0.0500 time 0.5184 (0.5335) data time 0.0013 (0.0054) model time 0.5171 (0.5287) loss 6.3550 (6.9411) grad_norm 17.1960 (2.8098) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:20:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][140/625] eta 0:04:18 lr 0.000106 wd 0.0500 time 0.5222 (0.5332) data time 0.0011 (0.0051) model time 0.5211 (0.5286) loss 6.8145 (6.9423) grad_norm 2.5740 (2.8730) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:20:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][150/625] eta 0:04:13 lr 0.000106 wd 0.0500 time 0.5183 (0.5327) data time 0.0010 (0.0049) model time 0.5173 (0.5282) loss 7.5336 (6.9642) grad_norm 2.7976 (2.8495) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:20:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][160/625] eta 0:04:07 lr 0.000106 wd 0.0500 time 0.5253 (0.5324) data time 0.0014 (0.0046) model time 0.5239 (0.5281) loss 7.6990 (6.9969) grad_norm 1.9739 (2.8282) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:20:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][170/625] eta 0:04:01 lr 0.000106 wd 0.0500 time 0.5186 (0.5316) data time 0.0010 (0.0044) model time 0.5176 (0.5273) loss 7.6075 (6.9987) grad_norm 2.1846 (2.7975) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:20:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][180/625] eta 0:03:56 lr 0.000106 wd 0.0500 time 0.5178 (0.5315) data time 0.0012 (0.0042) model time 0.5167 (0.5273) loss 7.1810 (6.9887) grad_norm 1.5489 (2.7746) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:20:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][190/625] eta 0:03:51 lr 0.000106 wd 0.0500 time 0.5189 (0.5312) data time 0.0009 (0.0041) model time 0.5180 (0.5271) loss 6.0207 (6.9670) grad_norm 2.9006 (2.7622) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:20:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][200/625] eta 0:03:45 lr 0.000105 wd 0.0500 time 0.5195 (0.5312) data time 0.0009 (0.0039) model time 0.5186 (0.5273) loss 6.7328 (6.9489) grad_norm 2.0006 (2.7462) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:20:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][210/625] eta 0:03:40 lr 0.000105 wd 0.0500 time 0.5392 (0.5311) data time 0.0009 (0.0038) model time 0.5383 (0.5274) loss 7.6677 (6.9558) grad_norm 2.0400 (2.7440) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:21:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][220/625] eta 0:03:35 lr 0.000105 wd 0.0500 time 0.5241 (0.5309) data time 0.0010 (0.0037) model time 0.5231 (0.5273) loss 6.7887 (6.9681) grad_norm 2.0184 (2.7303) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:21:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][230/625] eta 0:03:29 lr 0.000105 wd 0.0500 time 0.5211 (0.5306) data time 0.0007 (0.0036) model time 0.5204 (0.5271) loss 6.7979 (6.9804) grad_norm 2.2698 (2.7092) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:21:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][240/625] eta 0:03:24 lr 0.000105 wd 0.0500 time 0.5211 (0.5306) data time 0.0010 (0.0035) model time 0.5201 (0.5271) loss 8.8261 (7.0056) grad_norm 1.8258 (2.7366) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:21:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][250/625] eta 0:03:18 lr 0.000105 wd 0.0500 time 0.5214 (0.5302) data time 0.0010 (0.0034) model time 0.5205 (0.5268) loss 6.5728 (7.0162) grad_norm 2.0918 (2.7419) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:21:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][260/625] eta 0:03:13 lr 0.000105 wd 0.0500 time 0.5306 (0.5309) data time 0.0012 (0.0033) model time 0.5294 (0.5277) loss 7.9868 (7.0128) grad_norm 13.6609 (2.8700) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:21:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][270/625] eta 0:03:08 lr 0.000105 wd 0.0500 time 0.5702 (0.5308) data time 0.0007 (0.0032) model time 0.5695 (0.5277) loss 6.1509 (7.0070) grad_norm 1.9153 (2.8541) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:21:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][280/625] eta 0:03:03 lr 0.000105 wd 0.0500 time 0.5192 (0.5309) data time 0.0007 (0.0032) model time 0.5184 (0.5279) loss 8.0986 (7.0186) grad_norm 2.6740 (2.8388) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:21:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][290/625] eta 0:02:57 lr 0.000105 wd 0.0500 time 0.5380 (0.5311) data time 0.0014 (0.0031) model time 0.5366 (0.5282) loss 7.5118 (7.0246) grad_norm 3.3855 (2.8422) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:21:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][300/625] eta 0:02:52 lr 0.000105 wd 0.0500 time 0.6792 (0.5313) data time 0.0007 (0.0030) model time 0.6785 (0.5285) loss 6.8058 (7.0260) grad_norm 2.0029 (2.8342) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:21:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][310/625] eta 0:02:47 lr 0.000105 wd 0.0500 time 0.5790 (0.5313) data time 0.0010 (0.0030) model time 0.5779 (0.5286) loss 8.3543 (7.0274) grad_norm 2.0862 (2.8222) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:21:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][320/625] eta 0:02:41 lr 0.000105 wd 0.0500 time 0.5472 (0.5311) data time 0.0010 (0.0029) model time 0.5462 (0.5283) loss 7.6929 (7.0227) grad_norm 2.7677 (2.8026) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:22:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][330/625] eta 0:02:36 lr 0.000105 wd 0.0500 time 0.5175 (0.5314) data time 0.0009 (0.0030) model time 0.5166 (0.5287) loss 6.2504 (7.0201) grad_norm 2.1188 (2.7946) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:22:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][340/625] eta 0:02:31 lr 0.000105 wd 0.0500 time 0.5347 (0.5314) data time 0.0010 (0.0030) model time 0.5336 (0.5287) loss 7.0106 (7.0255) grad_norm 3.1992 (2.7848) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:22:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][350/625] eta 0:02:26 lr 0.000105 wd 0.0500 time 0.5312 (0.5314) data time 0.0007 (0.0029) model time 0.5305 (0.5287) loss 6.6119 (7.0266) grad_norm 1.8193 (2.7772) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:22:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][360/625] eta 0:02:20 lr 0.000105 wd 0.0500 time 0.5328 (0.5310) data time 0.0007 (0.0029) model time 0.5321 (0.5284) loss 6.2933 (7.0054) grad_norm 2.0951 (2.7790) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:22:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][370/625] eta 0:02:15 lr 0.000104 wd 0.0500 time 0.5294 (0.5311) data time 0.0011 (0.0028) model time 0.5284 (0.5285) loss 6.0517 (6.9964) grad_norm 2.1631 (2.7752) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:22:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][380/625] eta 0:02:10 lr 0.000104 wd 0.0500 time 0.5210 (0.5311) data time 0.0016 (0.0028) model time 0.5194 (0.5285) loss 7.2949 (6.9951) grad_norm 2.0675 (2.7655) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:22:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][390/625] eta 0:02:04 lr 0.000104 wd 0.0500 time 0.5483 (0.5312) data time 0.0007 (0.0027) model time 0.5476 (0.5287) loss 7.0729 (7.0019) grad_norm 2.2139 (2.7524) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:22:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][400/625] eta 0:01:59 lr 0.000104 wd 0.0500 time 0.5497 (0.5312) data time 0.0013 (0.0027) model time 0.5485 (0.5287) loss 7.1024 (7.0075) grad_norm 3.5107 (2.7355) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:22:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][410/625] eta 0:01:54 lr 0.000104 wd 0.0500 time 0.5200 (0.5313) data time 0.0010 (0.0027) model time 0.5190 (0.5289) loss 6.6854 (7.0074) grad_norm 1.6175 (2.7381) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:22:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][420/625] eta 0:01:48 lr 0.000104 wd 0.0500 time 0.5412 (0.5313) data time 0.0008 (0.0026) model time 0.5404 (0.5289) loss 6.3282 (7.0119) grad_norm 4.1800 (2.7485) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:22:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][430/625] eta 0:01:43 lr 0.000104 wd 0.0500 time 0.5259 (0.5313) data time 0.0009 (0.0026) model time 0.5250 (0.5290) loss 7.1376 (7.0014) grad_norm 1.9205 (2.7544) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:22:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][440/625] eta 0:01:38 lr 0.000104 wd 0.0500 time 0.5879 (0.5315) data time 0.0016 (0.0026) model time 0.5863 (0.5292) loss 5.9520 (6.9968) grad_norm 2.5153 (2.7443) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:23:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][450/625] eta 0:01:33 lr 0.000104 wd 0.0500 time 0.5156 (0.5317) data time 0.0008 (0.0025) model time 0.5148 (0.5294) loss 6.9523 (6.9966) grad_norm 2.9550 (2.7602) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:23:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][460/625] eta 0:01:27 lr 0.000104 wd 0.0500 time 0.5451 (0.5318) data time 0.0014 (0.0025) model time 0.5438 (0.5295) loss 7.0272 (6.9952) grad_norm 2.2316 (2.7612) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:23:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][470/625] eta 0:01:22 lr 0.000104 wd 0.0500 time 0.5177 (0.5316) data time 0.0009 (0.0025) model time 0.5168 (0.5293) loss 7.8127 (7.0006) grad_norm 7.6364 (2.7804) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:23:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][480/625] eta 0:01:17 lr 0.000104 wd 0.0500 time 0.5188 (0.5323) data time 0.0011 (0.0025) model time 0.5178 (0.5301) loss 8.2349 (7.0061) grad_norm 4.7003 (2.7877) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:23:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][490/625] eta 0:01:11 lr 0.000104 wd 0.0500 time 0.5923 (0.5324) data time 0.0007 (0.0025) model time 0.5916 (0.5303) loss 6.9787 (7.0055) grad_norm 5.9408 (2.7904) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:23:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][500/625] eta 0:01:06 lr 0.000104 wd 0.0500 time 0.5401 (0.5328) data time 0.0007 (0.0025) model time 0.5394 (0.5307) loss 7.3983 (7.0048) grad_norm 2.0788 (2.7854) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:23:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][510/625] eta 0:01:01 lr 0.000104 wd 0.0500 time 0.6085 (0.5328) data time 0.0013 (0.0025) model time 0.6072 (0.5307) loss 7.2647 (7.0116) grad_norm 2.6698 (2.7834) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:23:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][520/625] eta 0:00:55 lr 0.000104 wd 0.0500 time 0.5170 (0.5327) data time 0.0007 (0.0025) model time 0.5162 (0.5305) loss 6.1094 (7.0077) grad_norm 1.6118 (2.7705) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:23:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][530/625] eta 0:00:50 lr 0.000104 wd 0.0500 time 0.5148 (0.5330) data time 0.0010 (0.0024) model time 0.5138 (0.5309) loss 6.1187 (6.9965) grad_norm 2.3706 (2.7657) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:23:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][540/625] eta 0:00:45 lr 0.000104 wd 0.0500 time 0.5265 (0.5330) data time 0.0010 (0.0024) model time 0.5256 (0.5309) loss 7.0811 (6.9881) grad_norm 3.9210 (2.7595) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:23:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][550/625] eta 0:00:39 lr 0.000103 wd 0.0500 time 0.5165 (0.5329) data time 0.0008 (0.0024) model time 0.5157 (0.5309) loss 7.4592 (6.9830) grad_norm 2.6720 (2.7515) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:24:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][560/625] eta 0:00:34 lr 0.000103 wd 0.0500 time 0.5478 (0.5328) data time 0.0011 (0.0024) model time 0.5467 (0.5308) loss 6.2719 (6.9817) grad_norm 1.7185 (2.7404) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:24:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][570/625] eta 0:00:29 lr 0.000103 wd 0.0500 time 0.5487 (0.5327) data time 0.0013 (0.0024) model time 0.5474 (0.5307) loss 6.7841 (6.9845) grad_norm 1.6816 (2.7258) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:24:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][580/625] eta 0:00:23 lr 0.000103 wd 0.0500 time 0.5185 (0.5327) data time 0.0011 (0.0023) model time 0.5174 (0.5307) loss 7.0221 (6.9943) grad_norm 5.4982 (2.7333) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:24:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][590/625] eta 0:00:18 lr 0.000103 wd 0.0500 time 0.5247 (0.5326) data time 0.0014 (0.0023) model time 0.5233 (0.5306) loss 7.8500 (6.9923) grad_norm 2.2796 (2.7275) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:24:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][600/625] eta 0:00:13 lr 0.000103 wd 0.0500 time 0.5204 (0.5325) data time 0.0009 (0.0023) model time 0.5195 (0.5305) loss 7.2093 (6.9929) grad_norm 2.3785 (2.7206) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:24:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][610/625] eta 0:00:07 lr 0.000103 wd 0.0500 time 0.5222 (0.5325) data time 0.0007 (0.0023) model time 0.5214 (0.5305) loss 6.6578 (6.9956) grad_norm 1.8395 (2.7159) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:24:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [249/300][620/625] eta 0:00:02 lr 0.000103 wd 0.0500 time 0.5190 (0.5322) data time 0.0005 (0.0022) model time 0.5185 (0.5302) loss 8.6079 (6.9993) grad_norm 2.0252 (2.7052) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:24:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 249 training takes 0:05:32 +[2024-07-28 02:24:37 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 02:24:39 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 02:24:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.596 (0.596) Loss 0.4900 (0.4900) Acc@1 90.430 (90.430) Acc@5 99.072 (99.072) Mem 22339MB +[2024-07-28 02:24:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.161) Loss 0.7378 (0.5969) Acc@1 83.252 (87.971) Acc@5 97.217 (98.184) Mem 22339MB +[2024-07-28 02:24:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.140) Loss 0.8193 (0.6843) Acc@1 80.908 (85.366) Acc@5 96.191 (97.370) Mem 22339MB +[2024-07-28 02:24:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.977 Acc@5 97.361 +[2024-07-28 02:24:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.0% +[2024-07-28 02:24:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.983 (0.983) Loss 0.5010 (0.5010) Acc@1 90.430 (90.430) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-28 02:24:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.199) Loss 0.7422 (0.6098) Acc@1 82.910 (88.059) Acc@5 96.875 (98.127) Mem 22339MB +[2024-07-28 02:24:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.160) Loss 0.8394 (0.6951) Acc@1 80.566 (85.263) Acc@5 96.094 (97.342) Mem 22339MB +[2024-07-28 02:24:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.859 Acc@5 97.337 +[2024-07-28 02:24:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.9% +[2024-07-28 02:24:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][0/625] eta 0:16:33 lr 0.000103 wd 0.0500 time 1.5902 (1.5902) data time 0.7366 (0.7366) model time 0.0000 (0.0000) loss 6.9478 (6.9478) grad_norm 1.7915 (1.7915) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:24:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][10/625] eta 0:06:22 lr 0.000103 wd 0.0500 time 0.5174 (0.6222) data time 0.0007 (0.0681) model time 0.0000 (0.0000) loss 6.4046 (6.6125) grad_norm 3.3973 (2.4389) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:24:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][20/625] eta 0:05:49 lr 0.000103 wd 0.0500 time 0.5200 (0.5771) data time 0.0007 (0.0361) model time 0.0000 (0.0000) loss 6.4660 (6.8273) grad_norm 2.5528 (2.6523) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:25:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][30/625] eta 0:05:34 lr 0.000103 wd 0.0500 time 0.5460 (0.5617) data time 0.0009 (0.0248) model time 0.0000 (0.0000) loss 6.6998 (6.9675) grad_norm 1.8116 (2.5115) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:25:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][40/625] eta 0:05:23 lr 0.000103 wd 0.0500 time 0.5646 (0.5528) data time 0.0014 (0.0192) model time 0.0000 (0.0000) loss 7.3983 (6.9718) grad_norm 1.9394 (2.5060) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:25:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][50/625] eta 0:05:15 lr 0.000103 wd 0.0500 time 0.5376 (0.5484) data time 0.0007 (0.0156) model time 0.0000 (0.0000) loss 7.2313 (7.0420) grad_norm 2.4784 (2.4552) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:25:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][60/625] eta 0:05:08 lr 0.000103 wd 0.0500 time 0.5165 (0.5455) data time 0.0012 (0.0133) model time 0.5152 (0.5294) loss 7.3861 (7.0135) grad_norm 3.2521 (2.4140) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:25:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][70/625] eta 0:05:03 lr 0.000103 wd 0.0500 time 0.7218 (0.5473) data time 0.0010 (0.0115) model time 0.7208 (0.5435) loss 6.2554 (7.0078) grad_norm 2.5266 (2.4509) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:25:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][80/625] eta 0:04:57 lr 0.000103 wd 0.0500 time 0.5259 (0.5450) data time 0.0017 (0.0103) model time 0.5242 (0.5381) loss 7.6434 (7.0193) grad_norm 1.9414 (2.5226) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:25:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][90/625] eta 0:04:51 lr 0.000103 wd 0.0500 time 0.5185 (0.5443) data time 0.0007 (0.0093) model time 0.5178 (0.5379) loss 5.9020 (7.0398) grad_norm 3.0804 (2.5622) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:25:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][100/625] eta 0:04:44 lr 0.000102 wd 0.0500 time 0.5179 (0.5427) data time 0.0011 (0.0084) model time 0.5168 (0.5359) loss 6.5979 (7.0231) grad_norm 2.6769 (2.5555) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:25:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][110/625] eta 0:04:38 lr 0.000102 wd 0.0500 time 0.5192 (0.5411) data time 0.0009 (0.0078) model time 0.5183 (0.5339) loss 6.5270 (7.0119) grad_norm 2.1148 (2.5261) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:25:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][120/625] eta 0:04:32 lr 0.000102 wd 0.0500 time 0.5170 (0.5401) data time 0.0009 (0.0072) model time 0.5160 (0.5330) loss 7.4056 (7.0001) grad_norm 1.7743 (2.5453) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:25:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][130/625] eta 0:04:26 lr 0.000102 wd 0.0500 time 0.5267 (0.5387) data time 0.0009 (0.0067) model time 0.5258 (0.5315) loss 6.3791 (7.0058) grad_norm 2.3225 (2.5479) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:26:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][140/625] eta 0:04:20 lr 0.000102 wd 0.0500 time 0.5175 (0.5377) data time 0.0009 (0.0063) model time 0.5166 (0.5306) loss 7.0432 (7.0339) grad_norm 4.8677 (2.5657) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:26:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][150/625] eta 0:04:14 lr 0.000102 wd 0.0500 time 0.5311 (0.5366) data time 0.0013 (0.0060) model time 0.5298 (0.5296) loss 5.7440 (7.0058) grad_norm 1.8834 (2.5528) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:26:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][160/625] eta 0:04:09 lr 0.000102 wd 0.0500 time 0.5170 (0.5363) data time 0.0010 (0.0057) model time 0.5160 (0.5296) loss 6.2299 (7.0055) grad_norm 7.4430 (2.5930) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:26:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][170/625] eta 0:04:03 lr 0.000102 wd 0.0500 time 0.5754 (0.5355) data time 0.0010 (0.0054) model time 0.5744 (0.5289) loss 6.3345 (6.9946) grad_norm 2.1565 (2.5840) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:26:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][180/625] eta 0:03:58 lr 0.000102 wd 0.0500 time 0.5170 (0.5350) data time 0.0009 (0.0052) model time 0.5161 (0.5287) loss 8.7009 (6.9817) grad_norm 1.7384 (2.5572) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:26:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][190/625] eta 0:03:52 lr 0.000102 wd 0.0500 time 0.5653 (0.5345) data time 0.0007 (0.0050) model time 0.5646 (0.5284) loss 6.4140 (6.9820) grad_norm 2.8100 (2.6632) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:26:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][200/625] eta 0:03:47 lr 0.000102 wd 0.0500 time 0.5195 (0.5341) data time 0.0008 (0.0048) model time 0.5186 (0.5282) loss 6.0349 (6.9650) grad_norm 3.1151 (2.6744) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:26:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][210/625] eta 0:03:41 lr 0.000102 wd 0.0500 time 0.5820 (0.5340) data time 0.0012 (0.0046) model time 0.5809 (0.5282) loss 7.5896 (6.9664) grad_norm 2.9115 (2.6721) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:26:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][220/625] eta 0:03:36 lr 0.000102 wd 0.0500 time 0.5300 (0.5335) data time 0.0010 (0.0045) model time 0.5290 (0.5278) loss 6.6190 (6.9493) grad_norm 2.8389 (2.6630) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:26:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][230/625] eta 0:03:30 lr 0.000102 wd 0.0500 time 0.5615 (0.5333) data time 0.0010 (0.0044) model time 0.5605 (0.5278) loss 6.5133 (6.9593) grad_norm 1.9547 (2.6537) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:26:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][240/625] eta 0:03:25 lr 0.000102 wd 0.0500 time 0.5214 (0.5329) data time 0.0008 (0.0042) model time 0.5206 (0.5276) loss 7.3600 (6.9420) grad_norm 2.7879 (2.6492) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:26:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][250/625] eta 0:03:19 lr 0.000102 wd 0.0500 time 0.5176 (0.5325) data time 0.0007 (0.0041) model time 0.5169 (0.5273) loss 6.1026 (6.9444) grad_norm 2.2622 (2.6447) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:27:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][260/625] eta 0:03:14 lr 0.000102 wd 0.0500 time 0.5184 (0.5324) data time 0.0012 (0.0040) model time 0.5172 (0.5274) loss 6.9966 (6.9581) grad_norm 3.7197 (2.6407) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:27:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][270/625] eta 0:03:08 lr 0.000102 wd 0.0500 time 0.5240 (0.5323) data time 0.0014 (0.0039) model time 0.5226 (0.5274) loss 7.4957 (6.9584) grad_norm 2.1997 (2.6317) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:27:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][280/625] eta 0:03:03 lr 0.000101 wd 0.0500 time 0.5201 (0.5319) data time 0.0009 (0.0038) model time 0.5192 (0.5271) loss 6.6314 (6.9660) grad_norm 22.2699 (2.6922) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:27:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][290/625] eta 0:02:58 lr 0.000101 wd 0.0500 time 0.5214 (0.5317) data time 0.0011 (0.0037) model time 0.5202 (0.5270) loss 6.7263 (6.9684) grad_norm 2.0986 (2.6698) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:27:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][300/625] eta 0:02:53 lr 0.000101 wd 0.0500 time 0.5189 (0.5326) data time 0.0007 (0.0036) model time 0.5182 (0.5283) loss 6.5233 (6.9663) grad_norm 10.3722 (2.6795) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:27:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][310/625] eta 0:02:47 lr 0.000101 wd 0.0500 time 0.5348 (0.5331) data time 0.0012 (0.0035) model time 0.5336 (0.5290) loss 6.1067 (6.9583) grad_norm 2.9094 (2.6705) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:27:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][320/625] eta 0:02:42 lr 0.000101 wd 0.0500 time 0.5409 (0.5327) data time 0.0009 (0.0034) model time 0.5400 (0.5286) loss 7.5959 (6.9504) grad_norm 2.6435 (2.6704) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:27:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][330/625] eta 0:02:37 lr 0.000101 wd 0.0500 time 0.5193 (0.5324) data time 0.0010 (0.0034) model time 0.5184 (0.5284) loss 6.8311 (6.9571) grad_norm 1.9313 (2.6824) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:27:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-28 02:27:43 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 02:27:44 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 02:29:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 02:29:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 02:30:22 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-28 02:30:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-28 02:30:30 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-28 02:30:31 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-28 02:30:31 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-28 02:30:31 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 250) +[2024-07-28 02:30:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-28 02:30:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][340/625] eta 0:07:51 lr 0.000101 wd 0.0500 time 0.5675 (1.6540) data time 0.0007 (0.0810) model time 0.5668 (1.5729) loss 6.5720 (7.1790) grad_norm 2.5125 (2.4740) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:30:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][350/625] eta 0:04:58 lr 0.000101 wd 0.0500 time 0.5662 (1.0868) data time 0.0009 (0.0396) model time 0.5653 (1.0472) loss 6.8296 (7.1248) grad_norm 3.0318 (2.4310) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:31:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][360/625] eta 0:04:00 lr 0.000101 wd 0.0500 time 0.5699 (0.9090) data time 0.0007 (0.0263) model time 0.5692 (0.8828) loss 6.3823 (7.1391) grad_norm 3.3320 (2.4414) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:31:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][370/625] eta 0:03:29 lr 0.000101 wd 0.0500 time 0.5686 (0.8233) data time 0.0009 (0.0198) model time 0.5677 (0.8035) loss 7.6151 (7.0951) grad_norm 3.2557 (2.4743) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:31:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][380/625] eta 0:03:09 lr 0.000101 wd 0.0500 time 0.5727 (0.7723) data time 0.0010 (0.0160) model time 0.5716 (0.7563) loss 6.8540 (7.1179) grad_norm 2.1315 (2.5200) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:31:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][390/625] eta 0:02:55 lr 0.000101 wd 0.0500 time 0.5729 (0.7451) data time 0.0006 (0.0135) model time 0.5723 (0.7316) loss 6.2064 (7.0624) grad_norm 1.8568 (2.5062) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:31:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][400/625] eta 0:02:42 lr 0.000101 wd 0.0500 time 0.5748 (0.7206) data time 0.0009 (0.0116) model time 0.5739 (0.7089) loss 7.5543 (7.0349) grad_norm 2.0203 (2.5318) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:31:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][410/625] eta 0:02:30 lr 0.000101 wd 0.0500 time 0.5733 (0.7023) data time 0.0008 (0.0103) model time 0.5725 (0.6920) loss 6.9936 (7.0405) grad_norm 2.3594 (2.7973) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:31:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][420/625] eta 0:02:21 lr 0.000101 wd 0.0500 time 0.5799 (0.6882) data time 0.0006 (0.0092) model time 0.5793 (0.6790) loss 6.8928 (7.0447) grad_norm 2.2369 (2.7541) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:31:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][430/625] eta 0:02:12 lr 0.000101 wd 0.0500 time 0.5720 (0.6769) data time 0.0008 (0.0084) model time 0.5712 (0.6686) loss 7.2718 (7.0690) grad_norm 2.9360 (2.7233) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:31:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][440/625] eta 0:02:03 lr 0.000101 wd 0.0500 time 0.5737 (0.6677) data time 0.0006 (0.0077) model time 0.5731 (0.6600) loss 7.7466 (7.0853) grad_norm 4.0715 (2.6948) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:31:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][450/625] eta 0:01:55 lr 0.000101 wd 0.0500 time 0.5732 (0.6600) data time 0.0009 (0.0072) model time 0.5724 (0.6528) loss 7.1941 (7.0742) grad_norm 2.6627 (2.6742) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:32:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][460/625] eta 0:01:47 lr 0.000100 wd 0.0500 time 0.5748 (0.6537) data time 0.0007 (0.0067) model time 0.5741 (0.6470) loss 6.1919 (7.0516) grad_norm 2.0523 (2.6555) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:32:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][470/625] eta 0:01:40 lr 0.000100 wd 0.0500 time 0.5735 (0.6482) data time 0.0006 (0.0063) model time 0.5729 (0.6420) loss 8.1383 (7.0488) grad_norm 2.2407 (2.6385) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:32:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][480/625] eta 0:01:33 lr 0.000100 wd 0.0500 time 0.5764 (0.6436) data time 0.0007 (0.0059) model time 0.5757 (0.6377) loss 6.2748 (7.0255) grad_norm 2.3946 (2.7601) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:32:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][490/625] eta 0:01:26 lr 0.000100 wd 0.0500 time 0.5769 (0.6395) data time 0.0007 (0.0056) model time 0.5762 (0.6340) loss 7.6106 (7.0187) grad_norm 1.9878 (2.7631) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:32:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][500/625] eta 0:01:19 lr 0.000100 wd 0.0500 time 0.5728 (0.6361) data time 0.0009 (0.0053) model time 0.5720 (0.6307) loss 7.2380 (7.0386) grad_norm 2.2325 (2.7384) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:32:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][510/625] eta 0:01:12 lr 0.000100 wd 0.0500 time 0.5738 (0.6328) data time 0.0007 (0.0051) model time 0.5732 (0.6277) loss 7.0161 (7.0245) grad_norm 3.6599 (2.7318) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:32:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][520/625] eta 0:01:06 lr 0.000100 wd 0.0500 time 0.5772 (0.6299) data time 0.0006 (0.0049) model time 0.5766 (0.6251) loss 7.1386 (7.0314) grad_norm 2.6439 (2.7103) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:32:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][530/625] eta 0:00:59 lr 0.000100 wd 0.0500 time 0.5775 (0.6273) data time 0.0009 (0.0047) model time 0.5766 (0.6226) loss 6.1618 (7.0015) grad_norm 2.7501 (2.7042) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:32:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][540/625] eta 0:00:53 lr 0.000100 wd 0.0500 time 0.5764 (0.6250) data time 0.0006 (0.0045) model time 0.5758 (0.6205) loss 6.3615 (6.9896) grad_norm 1.9554 (2.6892) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:32:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][550/625] eta 0:00:46 lr 0.000100 wd 0.0500 time 0.5821 (0.6230) data time 0.0007 (0.0043) model time 0.5815 (0.6187) loss 7.4511 (6.9758) grad_norm 2.1290 (2.6680) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:32:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][560/625] eta 0:00:40 lr 0.000100 wd 0.0500 time 0.5768 (0.6212) data time 0.0006 (0.0042) model time 0.5762 (0.6170) loss 5.9738 (6.9937) grad_norm 2.0606 (2.6436) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:33:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][570/625] eta 0:00:34 lr 0.000100 wd 0.0500 time 0.5774 (0.6195) data time 0.0007 (0.0040) model time 0.5767 (0.6154) loss 5.4064 (6.9845) grad_norm 3.4037 (2.6390) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:33:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][580/625] eta 0:00:27 lr 0.000100 wd 0.0500 time 0.5769 (0.6179) data time 0.0007 (0.0039) model time 0.5762 (0.6140) loss 6.3444 (6.9821) grad_norm 2.2510 (2.6292) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:33:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][590/625] eta 0:00:21 lr 0.000100 wd 0.0500 time 0.5825 (0.6164) data time 0.0009 (0.0038) model time 0.5816 (0.6126) loss 8.3480 (6.9737) grad_norm 1.9098 (2.6399) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:33:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][600/625] eta 0:00:15 lr 0.000100 wd 0.0500 time 0.5771 (0.6149) data time 0.0007 (0.0037) model time 0.5764 (0.6113) loss 5.8356 (6.9581) grad_norm 2.6709 (2.6274) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:33:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][610/625] eta 0:00:09 lr 0.000100 wd 0.0500 time 0.5744 (0.6145) data time 0.0006 (0.0036) model time 0.5738 (0.6109) loss 7.3830 (6.9611) grad_norm 6.2388 (2.6941) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:33:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [250/300][620/625] eta 0:00:03 lr 0.000100 wd 0.0500 time 0.5754 (0.6133) data time 0.0006 (0.0035) model time 0.5748 (0.6098) loss 6.2261 (6.9573) grad_norm 3.4802 (2.7463) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-28 02:33:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 250 training takes 0:02:59 +[2024-07-28 02:33:35 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 02:33:40 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 02:33:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.459 (0.459) Loss 0.4944 (0.4944) Acc@1 90.137 (90.137) Acc@5 98.975 (98.975) Mem 22344MB +[2024-07-28 02:33:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.124 (0.155) Loss 0.7427 (0.5968) Acc@1 82.715 (87.997) Acc@5 96.875 (98.162) Mem 22344MB +[2024-07-28 02:33:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.141) Loss 0.8188 (0.6828) Acc@1 80.518 (85.447) Acc@5 96.338 (97.324) Mem 22344MB +[2024-07-28 02:33:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.057 Acc@5 97.327 +[2024-07-28 02:33:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.1% +[2024-07-28 02:33:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 85.06% +[2024-07-28 02:33:46 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-28 02:33:52 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-28 02:33:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.463 (0.463) Loss 0.5005 (0.5005) Acc@1 90.430 (90.430) Acc@5 99.023 (99.023) Mem 22344MB +[2024-07-28 02:33:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.155) Loss 0.7427 (0.6093) Acc@1 82.959 (88.081) Acc@5 96.875 (98.122) Mem 22344MB +[2024-07-28 02:33:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.124 (0.141) Loss 0.8374 (0.6945) Acc@1 80.615 (85.305) Acc@5 96.143 (97.347) Mem 22344MB +[2024-07-28 02:33:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.899 Acc@5 97.343 +[2024-07-28 02:33:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.9% +[2024-07-28 02:33:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.90% +[2024-07-28 02:33:55 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-28 02:34:00 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-28 02:34:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][0/625] eta 0:12:46 lr 0.000100 wd 0.0500 time 1.2264 (1.2264) data time 0.4312 (0.4312) model time 0.0000 (0.0000) loss 6.5185 (6.5185) grad_norm 2.1880 (2.1880) loss_scale 512.0000 (512.0000) mem 22337MB +[2024-07-28 02:34:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][10/625] eta 0:06:51 lr 0.000099 wd 0.0500 time 0.5749 (0.6688) data time 0.0008 (0.0401) model time 0.0000 (0.0000) loss 6.8258 (6.7257) grad_norm 1.8257 (2.2098) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:34:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][20/625] eta 0:06:17 lr 0.000099 wd 0.0500 time 0.5722 (0.6232) data time 0.0009 (0.0215) model time 0.0000 (0.0000) loss 7.4268 (6.8947) grad_norm 2.0693 (2.2874) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:34:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][30/625] eta 0:06:01 lr 0.000099 wd 0.0500 time 0.5757 (0.6069) data time 0.0007 (0.0149) model time 0.0000 (0.0000) loss 7.6882 (7.0394) grad_norm 3.2523 (2.3033) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:34:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][40/625] eta 0:05:50 lr 0.000099 wd 0.0500 time 0.5767 (0.5995) data time 0.0007 (0.0115) model time 0.0000 (0.0000) loss 6.1307 (7.0135) grad_norm 3.3187 (2.3780) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:34:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][50/625] eta 0:05:42 lr 0.000099 wd 0.0500 time 0.5735 (0.5948) data time 0.0009 (0.0094) model time 0.0000 (0.0000) loss 7.8317 (7.0405) grad_norm 2.0766 (2.6510) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:34:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][60/625] eta 0:05:34 lr 0.000099 wd 0.0500 time 0.5732 (0.5923) data time 0.0008 (0.0080) model time 0.5724 (0.5789) loss 7.3047 (7.0225) grad_norm 2.6010 (2.7808) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:34:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][70/625] eta 0:05:27 lr 0.000099 wd 0.0500 time 0.5944 (0.5903) data time 0.0007 (0.0070) model time 0.5936 (0.5777) loss 6.3357 (7.0125) grad_norm 3.1575 (2.8139) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:34:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][80/625] eta 0:05:20 lr 0.000099 wd 0.0500 time 0.5859 (0.5884) data time 0.0009 (0.0063) model time 0.5850 (0.5764) loss 7.7322 (7.0285) grad_norm 3.1429 (2.8164) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:34:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][90/625] eta 0:05:14 lr 0.000099 wd 0.0500 time 0.5973 (0.5874) data time 0.0007 (0.0057) model time 0.5967 (0.5769) loss 5.2372 (6.9603) grad_norm 1.8147 (2.7763) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:35:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][100/625] eta 0:05:07 lr 0.000099 wd 0.0500 time 0.5783 (0.5866) data time 0.0008 (0.0052) model time 0.5775 (0.5774) loss 7.6470 (6.9555) grad_norm 2.2784 (2.7613) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:35:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][110/625] eta 0:05:01 lr 0.000099 wd 0.0500 time 0.5745 (0.5858) data time 0.0008 (0.0048) model time 0.5737 (0.5771) loss 7.0641 (6.9683) grad_norm 2.2231 (2.8292) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:35:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][120/625] eta 0:04:55 lr 0.000099 wd 0.0500 time 0.5939 (0.5854) data time 0.0009 (0.0046) model time 0.5930 (0.5774) loss 6.4400 (6.9585) grad_norm 2.4758 (2.8046) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:35:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][130/625] eta 0:04:49 lr 0.000099 wd 0.0500 time 0.5834 (0.5847) data time 0.0007 (0.0043) model time 0.5827 (0.5771) loss 7.3111 (6.9428) grad_norm 2.0661 (2.9698) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:35:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][140/625] eta 0:04:43 lr 0.000099 wd 0.0500 time 0.5856 (0.5845) data time 0.0009 (0.0041) model time 0.5847 (0.5776) loss 7.4932 (6.9644) grad_norm 2.1619 (2.9471) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:35:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][150/625] eta 0:04:37 lr 0.000099 wd 0.0500 time 0.6245 (0.5844) data time 0.0008 (0.0039) model time 0.6237 (0.5781) loss 7.4428 (6.9634) grad_norm 1.7305 (2.9026) loss_scale 1024.0000 (522.1722) mem 22339MB +[2024-07-28 02:35:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][160/625] eta 0:04:31 lr 0.000099 wd 0.0500 time 0.5793 (0.5844) data time 0.0007 (0.0038) model time 0.5786 (0.5784) loss 7.9485 (6.9426) grad_norm 2.2700 (inf) loss_scale 512.0000 (537.4410) mem 22339MB +[2024-07-28 02:35:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][170/625] eta 0:04:25 lr 0.000099 wd 0.0500 time 0.5775 (0.5840) data time 0.0007 (0.0037) model time 0.5767 (0.5782) loss 7.7823 (6.9387) grad_norm 2.6617 (inf) loss_scale 512.0000 (535.9532) mem 22339MB +[2024-07-28 02:35:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][180/625] eta 0:04:19 lr 0.000099 wd 0.0500 time 0.5817 (0.5838) data time 0.0009 (0.0035) model time 0.5808 (0.5782) loss 7.4984 (6.9325) grad_norm 2.0630 (inf) loss_scale 512.0000 (534.6298) mem 22339MB +[2024-07-28 02:35:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][190/625] eta 0:04:13 lr 0.000098 wd 0.0500 time 0.5925 (0.5838) data time 0.0009 (0.0035) model time 0.5916 (0.5784) loss 6.0051 (6.9423) grad_norm 2.3470 (inf) loss_scale 512.0000 (533.4450) mem 22339MB +[2024-07-28 02:35:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][200/625] eta 0:04:07 lr 0.000098 wd 0.0500 time 0.5822 (0.5834) data time 0.0009 (0.0034) model time 0.5813 (0.5782) loss 6.8948 (6.9347) grad_norm 4.6245 (inf) loss_scale 512.0000 (532.3781) mem 22339MB +[2024-07-28 02:36:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][210/625] eta 0:04:02 lr 0.000098 wd 0.0500 time 0.5786 (0.5841) data time 0.0009 (0.0032) model time 0.5777 (0.5794) loss 7.8913 (6.9335) grad_norm 2.3229 (inf) loss_scale 512.0000 (531.4123) mem 22339MB +[2024-07-28 02:36:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][220/625] eta 0:03:56 lr 0.000098 wd 0.0500 time 0.6000 (0.5842) data time 0.0009 (0.0031) model time 0.5992 (0.5797) loss 7.9296 (6.9521) grad_norm 2.2667 (inf) loss_scale 512.0000 (530.5339) mem 22339MB +[2024-07-28 02:36:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][230/625] eta 0:03:50 lr 0.000098 wd 0.0500 time 0.5999 (0.5841) data time 0.0006 (0.0031) model time 0.5993 (0.5798) loss 5.6008 (6.9451) grad_norm 2.2093 (inf) loss_scale 512.0000 (529.7316) mem 22339MB +[2024-07-28 02:36:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][240/625] eta 0:03:44 lr 0.000098 wd 0.0500 time 0.5790 (0.5843) data time 0.0008 (0.0030) model time 0.5782 (0.5802) loss 7.0741 (6.9474) grad_norm 2.3299 (inf) loss_scale 512.0000 (528.9959) mem 22339MB +[2024-07-28 02:36:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][250/625] eta 0:03:39 lr 0.000098 wd 0.0500 time 0.5814 (0.5843) data time 0.0007 (0.0029) model time 0.5807 (0.5804) loss 6.5869 (6.9449) grad_norm 2.7993 (inf) loss_scale 512.0000 (528.3187) mem 22339MB +[2024-07-28 02:36:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][260/625] eta 0:03:33 lr 0.000098 wd 0.0500 time 0.6326 (0.5843) data time 0.0007 (0.0028) model time 0.6319 (0.5805) loss 6.6859 (6.9405) grad_norm 2.0837 (inf) loss_scale 512.0000 (527.6935) mem 22339MB +[2024-07-28 02:36:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][270/625] eta 0:03:27 lr 0.000098 wd 0.0500 time 0.5765 (0.5840) data time 0.0008 (0.0027) model time 0.5757 (0.5803) loss 7.2555 (6.9494) grad_norm 2.0692 (inf) loss_scale 512.0000 (527.1144) mem 22339MB +[2024-07-28 02:36:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][280/625] eta 0:03:21 lr 0.000098 wd 0.0500 time 0.5763 (0.5839) data time 0.0006 (0.0027) model time 0.5757 (0.5803) loss 7.2145 (6.9519) grad_norm 6.9086 (inf) loss_scale 512.0000 (526.5765) mem 22339MB +[2024-07-28 02:36:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][290/625] eta 0:03:15 lr 0.000098 wd 0.0500 time 0.5769 (0.5840) data time 0.0007 (0.0026) model time 0.5763 (0.5805) loss 5.6298 (6.9601) grad_norm 2.2178 (inf) loss_scale 512.0000 (526.0756) mem 22339MB +[2024-07-28 02:36:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][300/625] eta 0:03:09 lr 0.000098 wd 0.0500 time 0.6695 (0.5840) data time 0.0007 (0.0026) model time 0.6688 (0.5806) loss 6.9416 (6.9575) grad_norm 2.7418 (inf) loss_scale 512.0000 (525.6080) mem 22339MB +[2024-07-28 02:37:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][310/625] eta 0:03:03 lr 0.000098 wd 0.0500 time 0.5725 (0.5838) data time 0.0008 (0.0025) model time 0.5717 (0.5804) loss 7.0689 (6.9491) grad_norm 1.4564 (inf) loss_scale 512.0000 (525.1704) mem 22339MB +[2024-07-28 02:37:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][320/625] eta 0:02:58 lr 0.000098 wd 0.0500 time 0.5770 (0.5840) data time 0.0007 (0.0025) model time 0.5763 (0.5809) loss 6.9520 (6.9523) grad_norm 2.6928 (inf) loss_scale 512.0000 (524.7601) mem 22339MB +[2024-07-28 02:37:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][330/625] eta 0:02:52 lr 0.000098 wd 0.0500 time 0.5932 (0.5839) data time 0.0008 (0.0024) model time 0.5925 (0.5808) loss 7.2070 (6.9568) grad_norm 2.8759 (inf) loss_scale 512.0000 (524.3746) mem 22339MB +[2024-07-28 02:37:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][340/625] eta 0:02:46 lr 0.000098 wd 0.0500 time 0.5681 (0.5840) data time 0.0010 (0.0026) model time 0.5672 (0.5808) loss 6.4015 (6.9629) grad_norm 3.2103 (inf) loss_scale 512.0000 (524.0117) mem 22339MB +[2024-07-28 02:37:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][350/625] eta 0:02:40 lr 0.000098 wd 0.0500 time 0.5746 (0.5843) data time 0.0007 (0.0025) model time 0.5739 (0.5812) loss 7.0999 (6.9537) grad_norm 2.2426 (inf) loss_scale 512.0000 (523.6695) mem 22339MB +[2024-07-28 02:37:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][360/625] eta 0:02:34 lr 0.000098 wd 0.0500 time 0.5766 (0.5842) data time 0.0006 (0.0025) model time 0.5760 (0.5812) loss 6.9246 (6.9465) grad_norm 3.0674 (inf) loss_scale 512.0000 (523.3463) mem 22339MB +[2024-07-28 02:37:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][370/625] eta 0:02:28 lr 0.000097 wd 0.0500 time 0.5765 (0.5842) data time 0.0012 (0.0024) model time 0.5752 (0.5812) loss 7.7455 (6.9384) grad_norm 2.7429 (inf) loss_scale 512.0000 (523.0404) mem 22339MB +[2024-07-28 02:37:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][380/625] eta 0:02:23 lr 0.000097 wd 0.0500 time 0.5781 (0.5841) data time 0.0006 (0.0024) model time 0.5775 (0.5811) loss 7.2663 (6.9427) grad_norm 3.7122 (inf) loss_scale 512.0000 (522.7507) mem 22339MB +[2024-07-28 02:37:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][390/625] eta 0:02:17 lr 0.000097 wd 0.0500 time 0.6139 (0.5840) data time 0.0006 (0.0024) model time 0.6133 (0.5810) loss 7.7797 (6.9394) grad_norm 3.3647 (inf) loss_scale 512.0000 (522.4757) mem 22339MB +[2024-07-28 02:37:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][400/625] eta 0:02:11 lr 0.000097 wd 0.0500 time 0.5938 (0.5840) data time 0.0008 (0.0023) model time 0.5930 (0.5811) loss 7.0022 (6.9366) grad_norm 10.9222 (inf) loss_scale 512.0000 (522.2145) mem 22339MB +[2024-07-28 02:38:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][410/625] eta 0:02:05 lr 0.000097 wd 0.0500 time 0.5799 (0.5838) data time 0.0010 (0.0023) model time 0.5789 (0.5809) loss 7.5324 (6.9294) grad_norm 3.5031 (inf) loss_scale 512.0000 (521.9659) mem 22339MB +[2024-07-28 02:38:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][420/625] eta 0:01:59 lr 0.000097 wd 0.0500 time 0.5776 (0.5837) data time 0.0007 (0.0023) model time 0.5770 (0.5808) loss 6.0331 (6.9178) grad_norm 3.1256 (inf) loss_scale 512.0000 (521.7292) mem 22339MB +[2024-07-28 02:38:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][430/625] eta 0:01:53 lr 0.000097 wd 0.0500 time 0.5842 (0.5840) data time 0.0006 (0.0022) model time 0.5836 (0.5813) loss 7.2559 (6.9225) grad_norm 1.9562 (inf) loss_scale 512.0000 (521.5035) mem 22339MB +[2024-07-28 02:38:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][440/625] eta 0:01:48 lr 0.000097 wd 0.0500 time 0.6223 (0.5839) data time 0.0008 (0.0022) model time 0.6215 (0.5812) loss 7.8025 (6.9292) grad_norm 3.0311 (inf) loss_scale 512.0000 (521.2880) mem 22339MB +[2024-07-28 02:38:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][450/625] eta 0:01:42 lr 0.000097 wd 0.0500 time 0.5850 (0.5838) data time 0.0006 (0.0022) model time 0.5844 (0.5811) loss 5.9563 (6.9348) grad_norm 2.1459 (inf) loss_scale 512.0000 (521.0820) mem 22339MB +[2024-07-28 02:38:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][460/625] eta 0:01:36 lr 0.000097 wd 0.0500 time 0.5828 (0.5836) data time 0.0009 (0.0022) model time 0.5819 (0.5810) loss 5.7084 (6.9322) grad_norm 2.0718 (inf) loss_scale 512.0000 (520.8850) mem 22339MB +[2024-07-28 02:38:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][470/625] eta 0:01:30 lr 0.000097 wd 0.0500 time 0.5794 (0.5835) data time 0.0008 (0.0021) model time 0.5786 (0.5809) loss 6.8314 (6.9337) grad_norm 3.6971 (inf) loss_scale 512.0000 (520.6964) mem 22339MB +[2024-07-28 02:38:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][480/625] eta 0:01:24 lr 0.000097 wd 0.0500 time 0.5758 (0.5833) data time 0.0009 (0.0021) model time 0.5749 (0.5807) loss 7.3111 (6.9383) grad_norm 2.8469 (inf) loss_scale 512.0000 (520.5156) mem 22339MB +[2024-07-28 02:38:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][490/625] eta 0:01:18 lr 0.000097 wd 0.0500 time 0.5834 (0.5832) data time 0.0006 (0.0021) model time 0.5828 (0.5806) loss 6.1091 (6.9419) grad_norm 2.3631 (inf) loss_scale 512.0000 (520.3422) mem 22339MB +[2024-07-28 02:38:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][500/625] eta 0:01:12 lr 0.000097 wd 0.0500 time 0.5767 (0.5831) data time 0.0009 (0.0021) model time 0.5758 (0.5805) loss 7.1719 (6.9398) grad_norm 2.8303 (inf) loss_scale 512.0000 (520.1756) mem 22339MB +[2024-07-28 02:38:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][510/625] eta 0:01:07 lr 0.000097 wd 0.0500 time 0.5946 (0.5829) data time 0.0008 (0.0020) model time 0.5938 (0.5804) loss 6.2302 (6.9349) grad_norm 2.3333 (inf) loss_scale 512.0000 (520.0157) mem 22339MB +[2024-07-28 02:39:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][520/625] eta 0:01:01 lr 0.000097 wd 0.0500 time 0.5835 (0.5828) data time 0.0008 (0.0020) model time 0.5827 (0.5803) loss 8.3736 (6.9352) grad_norm 1.9744 (inf) loss_scale 512.0000 (519.8618) mem 22339MB +[2024-07-28 02:39:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][530/625] eta 0:00:55 lr 0.000097 wd 0.0500 time 0.5732 (0.5827) data time 0.0008 (0.0020) model time 0.5724 (0.5802) loss 7.6315 (6.9359) grad_norm 2.4581 (inf) loss_scale 512.0000 (519.7137) mem 22339MB +[2024-07-28 02:39:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][540/625] eta 0:00:49 lr 0.000097 wd 0.0500 time 0.5788 (0.5829) data time 0.0007 (0.0020) model time 0.5781 (0.5804) loss 6.8309 (6.9299) grad_norm 1.7558 (inf) loss_scale 512.0000 (519.5712) mem 22339MB +[2024-07-28 02:39:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][550/625] eta 0:00:43 lr 0.000096 wd 0.0500 time 0.5784 (0.5828) data time 0.0007 (0.0020) model time 0.5777 (0.5803) loss 6.5542 (6.9269) grad_norm 1.8849 (inf) loss_scale 512.0000 (519.4338) mem 22339MB +[2024-07-28 02:39:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][560/625] eta 0:00:37 lr 0.000096 wd 0.0500 time 0.5765 (0.5826) data time 0.0008 (0.0019) model time 0.5758 (0.5802) loss 7.7154 (6.9250) grad_norm 3.2452 (inf) loss_scale 512.0000 (519.3012) mem 22339MB +[2024-07-28 02:39:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][570/625] eta 0:00:32 lr 0.000096 wd 0.0500 time 0.5791 (0.5825) data time 0.0008 (0.0019) model time 0.5783 (0.5801) loss 7.6408 (6.9239) grad_norm 2.2659 (inf) loss_scale 512.0000 (519.1734) mem 22339MB +[2024-07-28 02:39:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][580/625] eta 0:00:26 lr 0.000096 wd 0.0500 time 0.5775 (0.5824) data time 0.0010 (0.0019) model time 0.5765 (0.5800) loss 8.5713 (6.9295) grad_norm 2.1447 (inf) loss_scale 512.0000 (519.0499) mem 22339MB +[2024-07-28 02:39:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][590/625] eta 0:00:20 lr 0.000096 wd 0.0500 time 0.5772 (0.5823) data time 0.0007 (0.0019) model time 0.5765 (0.5799) loss 7.3721 (6.9286) grad_norm 1.8271 (inf) loss_scale 512.0000 (518.9306) mem 22339MB +[2024-07-28 02:39:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][600/625] eta 0:00:14 lr 0.000096 wd 0.0500 time 0.5761 (0.5822) data time 0.0008 (0.0019) model time 0.5753 (0.5798) loss 6.8926 (6.9295) grad_norm 1.9201 (inf) loss_scale 512.0000 (518.8153) mem 22339MB +[2024-07-28 02:39:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][610/625] eta 0:00:08 lr 0.000096 wd 0.0500 time 0.5825 (0.5821) data time 0.0005 (0.0019) model time 0.5820 (0.5798) loss 5.7975 (6.9296) grad_norm 1.8485 (inf) loss_scale 512.0000 (518.7038) mem 22339MB +[2024-07-28 02:40:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [251/300][620/625] eta 0:00:02 lr 0.000096 wd 0.0500 time 0.5790 (0.5820) data time 0.0004 (0.0018) model time 0.5786 (0.5797) loss 5.9853 (6.9276) grad_norm 4.7482 (inf) loss_scale 512.0000 (518.5958) mem 22339MB +[2024-07-28 02:40:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 251 training takes 0:06:03 +[2024-07-28 02:40:04 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 02:40:06 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 02:40:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.470 (0.470) Loss 0.4922 (0.4922) Acc@1 90.674 (90.674) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-28 02:40:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7485 (0.6019) Acc@1 83.008 (87.997) Acc@5 97.168 (98.149) Mem 22339MB +[2024-07-28 02:40:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8315 (0.6896) Acc@1 80.518 (85.363) Acc@5 96.240 (97.359) Mem 22339MB +[2024-07-28 02:40:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.017 Acc@5 97.357 +[2024-07-28 02:40:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.0% +[2024-07-28 02:40:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.883 (0.883) Loss 0.5000 (0.5000) Acc@1 90.430 (90.430) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-28 02:40:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.124 (0.195) Loss 0.7427 (0.6088) Acc@1 83.008 (88.081) Acc@5 96.924 (98.122) Mem 22339MB +[2024-07-28 02:40:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.162) Loss 0.8374 (0.6940) Acc@1 80.566 (85.314) Acc@5 96.191 (97.354) Mem 22339MB +[2024-07-28 02:40:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.907 Acc@5 97.347 +[2024-07-28 02:40:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.9% +[2024-07-28 02:40:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.91% +[2024-07-28 02:40:13 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-28 02:40:14 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-28 02:40:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][0/625] eta 0:09:06 lr 0.000096 wd 0.0500 time 0.8737 (0.8737) data time 0.3590 (0.3590) model time 0.0000 (0.0000) loss 7.3727 (7.3727) grad_norm 2.5374 (2.5374) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:40:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][10/625] eta 0:06:10 lr 0.000096 wd 0.0500 time 0.5755 (0.6022) data time 0.0006 (0.0335) model time 0.0000 (0.0000) loss 7.8999 (7.1400) grad_norm 14.5462 (3.8585) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:40:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][20/625] eta 0:06:05 lr 0.000096 wd 0.0500 time 0.7494 (0.6045) data time 0.0009 (0.0180) model time 0.0000 (0.0000) loss 6.1979 (6.9855) grad_norm 1.8323 (3.1483) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:40:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][30/625] eta 0:05:54 lr 0.000096 wd 0.0500 time 0.5706 (0.5953) data time 0.0006 (0.0125) model time 0.0000 (0.0000) loss 6.0242 (6.8640) grad_norm 3.5332 (2.9707) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:40:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][40/625] eta 0:05:45 lr 0.000096 wd 0.0500 time 0.5751 (0.5906) data time 0.0006 (0.0096) model time 0.0000 (0.0000) loss 5.6696 (6.8170) grad_norm 2.3600 (3.0779) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:40:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][50/625] eta 0:05:38 lr 0.000096 wd 0.0500 time 0.5732 (0.5887) data time 0.0010 (0.0079) model time 0.0000 (0.0000) loss 8.4729 (6.9332) grad_norm 2.0454 (2.9764) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:40:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][60/625] eta 0:05:31 lr 0.000096 wd 0.0500 time 0.5733 (0.5865) data time 0.0009 (0.0068) model time 0.5724 (0.5744) loss 6.3056 (6.9205) grad_norm 2.7270 (2.8559) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:40:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][70/625] eta 0:05:24 lr 0.000096 wd 0.0500 time 0.5756 (0.5851) data time 0.0007 (0.0060) model time 0.5749 (0.5749) loss 8.1572 (6.9591) grad_norm 3.0244 (2.8122) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:41:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][80/625] eta 0:05:19 lr 0.000096 wd 0.0500 time 0.5723 (0.5857) data time 0.0008 (0.0054) model time 0.5714 (0.5796) loss 7.2290 (6.9585) grad_norm 5.2376 (2.7885) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:41:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][90/625] eta 0:05:12 lr 0.000096 wd 0.0500 time 0.5736 (0.5848) data time 0.0007 (0.0049) model time 0.5729 (0.5788) loss 7.3173 (6.9811) grad_norm 3.0302 (2.7381) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:41:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][100/625] eta 0:05:06 lr 0.000096 wd 0.0500 time 0.5735 (0.5845) data time 0.0006 (0.0045) model time 0.5729 (0.5793) loss 7.2048 (6.9902) grad_norm 2.6080 (2.7337) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-28 02:41:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][110/625] eta 0:05:00 lr 0.000095 wd 0.0500 time 0.5671 (0.5837) data time 0.0008 (0.0042) model time 0.5663 (0.5785) loss 7.4315 (6.9504) grad_norm inf (inf) loss_scale 256.0000 (509.6937) mem 22339MB +[2024-07-28 02:41:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][120/625] eta 0:04:54 lr 0.000095 wd 0.0500 time 0.5672 (0.5834) data time 0.0008 (0.0039) model time 0.5663 (0.5786) loss 7.7906 (6.9619) grad_norm 2.3506 (inf) loss_scale 256.0000 (488.7273) mem 22339MB +[2024-07-28 02:41:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][130/625] eta 0:04:48 lr 0.000095 wd 0.0500 time 0.5756 (0.5833) data time 0.0009 (0.0037) model time 0.5747 (0.5789) loss 7.9977 (6.9674) grad_norm 2.8783 (inf) loss_scale 256.0000 (470.9618) mem 22339MB +[2024-07-28 02:41:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][140/625] eta 0:04:42 lr 0.000095 wd 0.0500 time 0.5774 (0.5830) data time 0.0009 (0.0035) model time 0.5765 (0.5788) loss 7.1265 (7.0033) grad_norm 2.8464 (inf) loss_scale 256.0000 (455.7163) mem 22339MB +[2024-07-28 02:41:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][150/625] eta 0:04:37 lr 0.000095 wd 0.0500 time 0.5717 (0.5833) data time 0.0009 (0.0033) model time 0.5708 (0.5796) loss 5.8437 (6.9820) grad_norm 2.7292 (inf) loss_scale 256.0000 (442.4901) mem 22339MB +[2024-07-28 02:41:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][160/625] eta 0:04:31 lr 0.000095 wd 0.0500 time 0.5668 (0.5831) data time 0.0006 (0.0032) model time 0.5662 (0.5796) loss 7.5631 (6.9834) grad_norm 5.6030 (inf) loss_scale 256.0000 (430.9068) mem 22339MB +[2024-07-28 02:41:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][170/625] eta 0:04:25 lr 0.000095 wd 0.0500 time 0.5738 (0.5827) data time 0.0006 (0.0030) model time 0.5732 (0.5792) loss 7.8213 (6.9674) grad_norm 2.1113 (inf) loss_scale 256.0000 (420.6784) mem 22339MB +[2024-07-28 02:42:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][180/625] eta 0:04:19 lr 0.000095 wd 0.0500 time 0.5728 (0.5824) data time 0.0007 (0.0029) model time 0.5722 (0.5790) loss 6.5166 (6.9589) grad_norm 2.2884 (inf) loss_scale 256.0000 (411.5801) mem 22339MB +[2024-07-28 02:42:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][190/625] eta 0:04:13 lr 0.000095 wd 0.0500 time 0.5723 (0.5821) data time 0.0006 (0.0028) model time 0.5717 (0.5787) loss 5.3270 (6.9576) grad_norm 2.8421 (inf) loss_scale 256.0000 (403.4346) mem 22339MB +[2024-07-28 02:42:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][200/625] eta 0:04:07 lr 0.000095 wd 0.0500 time 0.5705 (0.5824) data time 0.0008 (0.0027) model time 0.5697 (0.5793) loss 7.4826 (6.9832) grad_norm 3.1814 (inf) loss_scale 256.0000 (396.0995) mem 22339MB +[2024-07-28 02:42:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][210/625] eta 0:04:01 lr 0.000095 wd 0.0500 time 0.5687 (0.5825) data time 0.0008 (0.0026) model time 0.5679 (0.5796) loss 7.8750 (6.9848) grad_norm 2.0636 (inf) loss_scale 256.0000 (389.4597) mem 22339MB +[2024-07-28 02:42:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][220/625] eta 0:03:55 lr 0.000095 wd 0.0500 time 0.5731 (0.5823) data time 0.0007 (0.0026) model time 0.5724 (0.5793) loss 6.8340 (6.9790) grad_norm 2.2038 (inf) loss_scale 256.0000 (383.4208) mem 22339MB +[2024-07-28 02:42:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][230/625] eta 0:03:49 lr 0.000095 wd 0.0500 time 0.5726 (0.5821) data time 0.0008 (0.0025) model time 0.5717 (0.5792) loss 6.1764 (6.9773) grad_norm 3.1660 (inf) loss_scale 256.0000 (377.9048) mem 22339MB +[2024-07-28 02:42:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][240/625] eta 0:03:44 lr 0.000095 wd 0.0500 time 0.5693 (0.5837) data time 0.0008 (0.0025) model time 0.5685 (0.5813) loss 7.3008 (6.9839) grad_norm 2.5919 (inf) loss_scale 256.0000 (372.8465) mem 22339MB +[2024-07-28 02:42:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][250/625] eta 0:03:38 lr 0.000095 wd 0.0500 time 0.5700 (0.5834) data time 0.0010 (0.0024) model time 0.5691 (0.5810) loss 7.6105 (6.9768) grad_norm 1.7467 (inf) loss_scale 256.0000 (368.1912) mem 22339MB +[2024-07-28 02:42:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][260/625] eta 0:03:33 lr 0.000095 wd 0.0500 time 0.5724 (0.5837) data time 0.0008 (0.0023) model time 0.5716 (0.5815) loss 6.9286 (6.9845) grad_norm 2.4785 (inf) loss_scale 256.0000 (363.8927) mem 22339MB +[2024-07-28 02:42:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][270/625] eta 0:03:27 lr 0.000095 wd 0.0500 time 0.5693 (0.5837) data time 0.0008 (0.0023) model time 0.5686 (0.5815) loss 7.5230 (6.9983) grad_norm 2.9595 (inf) loss_scale 256.0000 (359.9114) mem 22339MB +[2024-07-28 02:42:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][280/625] eta 0:03:21 lr 0.000095 wd 0.0500 time 0.5724 (0.5834) data time 0.0008 (0.0023) model time 0.5716 (0.5813) loss 7.0363 (6.9903) grad_norm 1.8588 (inf) loss_scale 256.0000 (356.2135) mem 22339MB +[2024-07-28 02:43:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][290/625] eta 0:03:15 lr 0.000095 wd 0.0500 time 0.5185 (0.5842) data time 0.0008 (0.0023) model time 0.5178 (0.5822) loss 6.7534 (6.9850) grad_norm 2.0725 (inf) loss_scale 256.0000 (352.7698) mem 22339MB +[2024-07-28 02:43:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][300/625] eta 0:03:09 lr 0.000094 wd 0.0500 time 0.5728 (0.5843) data time 0.0008 (0.0022) model time 0.5720 (0.5824) loss 5.9983 (6.9760) grad_norm 1.9533 (inf) loss_scale 256.0000 (349.5548) mem 22339MB +[2024-07-28 02:43:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][310/625] eta 0:03:04 lr 0.000094 wd 0.0500 time 0.5700 (0.5845) data time 0.0006 (0.0022) model time 0.5693 (0.5826) loss 6.9643 (6.9724) grad_norm 2.3869 (inf) loss_scale 256.0000 (346.5466) mem 22339MB +[2024-07-28 02:43:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][320/625] eta 0:02:58 lr 0.000094 wd 0.0500 time 0.5749 (0.5844) data time 0.0010 (0.0022) model time 0.5739 (0.5826) loss 7.0856 (6.9755) grad_norm 2.3820 (inf) loss_scale 256.0000 (343.7259) mem 22339MB +[2024-07-28 02:43:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][330/625] eta 0:02:52 lr 0.000094 wd 0.0500 time 0.5747 (0.5841) data time 0.0008 (0.0021) model time 0.5738 (0.5822) loss 5.6968 (6.9695) grad_norm 2.8652 (inf) loss_scale 256.0000 (341.0755) mem 22339MB +[2024-07-28 02:43:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][340/625] eta 0:02:46 lr 0.000094 wd 0.0500 time 0.5678 (0.5843) data time 0.0008 (0.0021) model time 0.5669 (0.5824) loss 8.3601 (6.9701) grad_norm 7.9812 (inf) loss_scale 256.0000 (338.5806) mem 22339MB +[2024-07-28 02:43:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][350/625] eta 0:02:40 lr 0.000094 wd 0.0500 time 0.5720 (0.5840) data time 0.0008 (0.0020) model time 0.5712 (0.5821) loss 7.8850 (6.9810) grad_norm 2.5646 (inf) loss_scale 256.0000 (336.2279) mem 22339MB +[2024-07-28 02:43:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][360/625] eta 0:02:34 lr 0.000094 wd 0.0500 time 0.5660 (0.5840) data time 0.0006 (0.0021) model time 0.5654 (0.5821) loss 6.8167 (6.9839) grad_norm 1.7928 (inf) loss_scale 256.0000 (334.0055) mem 22339MB +[2024-07-28 02:43:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][370/625] eta 0:02:29 lr 0.000094 wd 0.0500 time 0.5711 (0.5843) data time 0.0009 (0.0020) model time 0.5702 (0.5826) loss 6.8586 (6.9920) grad_norm 3.7331 (inf) loss_scale 256.0000 (331.9030) mem 22339MB +[2024-07-28 02:43:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][380/625] eta 0:02:23 lr 0.000094 wd 0.0500 time 0.5652 (0.5843) data time 0.0009 (0.0020) model time 0.5643 (0.5825) loss 6.6768 (6.9880) grad_norm 2.9708 (inf) loss_scale 256.0000 (329.9108) mem 22339MB +[2024-07-28 02:44:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][390/625] eta 0:02:17 lr 0.000094 wd 0.0500 time 0.5709 (0.5841) data time 0.0007 (0.0020) model time 0.5702 (0.5823) loss 5.9368 (6.9872) grad_norm 2.6661 (inf) loss_scale 256.0000 (328.0205) mem 22339MB +[2024-07-28 02:44:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][400/625] eta 0:02:11 lr 0.000094 wd 0.0500 time 0.5767 (0.5847) data time 0.0010 (0.0020) model time 0.5757 (0.5830) loss 6.7977 (6.9822) grad_norm 3.6466 (inf) loss_scale 256.0000 (326.2244) mem 22339MB +[2024-07-28 02:44:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][410/625] eta 0:02:05 lr 0.000094 wd 0.0500 time 0.5688 (0.5845) data time 0.0010 (0.0020) model time 0.5678 (0.5828) loss 6.7196 (6.9796) grad_norm 2.8800 (inf) loss_scale 128.0000 (322.3358) mem 22339MB +[2024-07-28 02:44:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][420/625] eta 0:01:59 lr 0.000094 wd 0.0500 time 0.5757 (0.5844) data time 0.0006 (0.0020) model time 0.5751 (0.5827) loss 7.8079 (6.9807) grad_norm 1.9805 (inf) loss_scale 128.0000 (317.7197) mem 22339MB +[2024-07-28 02:44:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][430/625] eta 0:01:53 lr 0.000094 wd 0.0500 time 0.5785 (0.5843) data time 0.0006 (0.0019) model time 0.5779 (0.5826) loss 7.0721 (6.9745) grad_norm 4.5583 (inf) loss_scale 128.0000 (313.3179) mem 22339MB +[2024-07-28 02:44:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][440/625] eta 0:01:48 lr 0.000094 wd 0.0500 time 0.5738 (0.5843) data time 0.0008 (0.0019) model time 0.5729 (0.5825) loss 6.0103 (6.9776) grad_norm 1.9695 (inf) loss_scale 128.0000 (309.1156) mem 22339MB +[2024-07-28 02:44:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][450/625] eta 0:01:42 lr 0.000094 wd 0.0500 time 0.5756 (0.5841) data time 0.0008 (0.0019) model time 0.5748 (0.5824) loss 7.1082 (6.9918) grad_norm 2.0723 (inf) loss_scale 128.0000 (305.0998) mem 22339MB +[2024-07-28 02:44:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][460/625] eta 0:01:36 lr 0.000094 wd 0.0500 time 0.7830 (0.5847) data time 0.0008 (0.0019) model time 0.7822 (0.5830) loss 7.8978 (6.9971) grad_norm 3.0643 (inf) loss_scale 128.0000 (301.2581) mem 22339MB +[2024-07-28 02:44:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][470/625] eta 0:01:30 lr 0.000094 wd 0.0500 time 0.5710 (0.5845) data time 0.0006 (0.0018) model time 0.5704 (0.5829) loss 7.3200 (6.9998) grad_norm 2.1416 (inf) loss_scale 128.0000 (297.5796) mem 22339MB +[2024-07-28 02:44:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][480/625] eta 0:01:24 lr 0.000093 wd 0.0500 time 0.5728 (0.5844) data time 0.0007 (0.0018) model time 0.5721 (0.5828) loss 6.4729 (6.9963) grad_norm 1.9692 (inf) loss_scale 128.0000 (294.0541) mem 22339MB +[2024-07-28 02:45:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][490/625] eta 0:01:18 lr 0.000093 wd 0.0500 time 0.5761 (0.5842) data time 0.0006 (0.0018) model time 0.5755 (0.5826) loss 6.4300 (6.9969) grad_norm 2.6168 (inf) loss_scale 128.0000 (290.6721) mem 22339MB +[2024-07-28 02:45:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][500/625] eta 0:01:13 lr 0.000093 wd 0.0500 time 0.5752 (0.5840) data time 0.0008 (0.0018) model time 0.5744 (0.5824) loss 7.3199 (6.9971) grad_norm 2.7934 (inf) loss_scale 128.0000 (287.4251) mem 22339MB +[2024-07-28 02:45:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][510/625] eta 0:01:07 lr 0.000093 wd 0.0500 time 0.5748 (0.5839) data time 0.0007 (0.0018) model time 0.5741 (0.5823) loss 6.2311 (6.9981) grad_norm 3.8680 (inf) loss_scale 128.0000 (284.3053) mem 22339MB +[2024-07-28 02:45:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][520/625] eta 0:01:01 lr 0.000093 wd 0.0500 time 0.5764 (0.5840) data time 0.0008 (0.0018) model time 0.5756 (0.5825) loss 5.8537 (6.9936) grad_norm 3.0866 (inf) loss_scale 128.0000 (281.3052) mem 22339MB +[2024-07-28 02:45:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][530/625] eta 0:00:55 lr 0.000093 wd 0.0500 time 0.5714 (0.5839) data time 0.0007 (0.0017) model time 0.5708 (0.5823) loss 7.1683 (6.9959) grad_norm 2.3619 (inf) loss_scale 128.0000 (278.4181) mem 22339MB +[2024-07-28 02:45:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][540/625] eta 0:00:49 lr 0.000093 wd 0.0500 time 0.5660 (0.5837) data time 0.0008 (0.0017) model time 0.5651 (0.5821) loss 6.9204 (6.9980) grad_norm 1.7473 (inf) loss_scale 128.0000 (275.6377) mem 22339MB +[2024-07-28 02:45:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][550/625] eta 0:00:43 lr 0.000093 wd 0.0500 time 0.5672 (0.5836) data time 0.0008 (0.0017) model time 0.5664 (0.5820) loss 8.1006 (7.0028) grad_norm 2.0015 (inf) loss_scale 128.0000 (272.9583) mem 22339MB +[2024-07-28 02:45:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][560/625] eta 0:00:37 lr 0.000093 wd 0.0500 time 0.5708 (0.5835) data time 0.0006 (0.0017) model time 0.5701 (0.5819) loss 6.4634 (6.9936) grad_norm 1.7282 (inf) loss_scale 128.0000 (270.3743) mem 22339MB +[2024-07-28 02:45:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][570/625] eta 0:00:32 lr 0.000093 wd 0.0500 time 0.5744 (0.5833) data time 0.0006 (0.0017) model time 0.5738 (0.5817) loss 7.6926 (6.9863) grad_norm 1.6431 (inf) loss_scale 128.0000 (267.8809) mem 22339MB +[2024-07-28 02:45:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][580/625] eta 0:00:26 lr 0.000093 wd 0.0500 time 0.5763 (0.5832) data time 0.0006 (0.0017) model time 0.5756 (0.5816) loss 7.7957 (6.9813) grad_norm 3.7208 (inf) loss_scale 128.0000 (265.4733) mem 22339MB +[2024-07-28 02:45:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][590/625] eta 0:00:20 lr 0.000093 wd 0.0500 time 0.5758 (0.5831) data time 0.0008 (0.0017) model time 0.5750 (0.5815) loss 6.2960 (6.9869) grad_norm 3.1442 (inf) loss_scale 128.0000 (263.1472) mem 22339MB +[2024-07-28 02:46:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][600/625] eta 0:00:14 lr 0.000093 wd 0.0500 time 0.5743 (0.5830) data time 0.0009 (0.0016) model time 0.5734 (0.5814) loss 7.0418 (6.9886) grad_norm 2.2263 (inf) loss_scale 128.0000 (260.8985) mem 22339MB +[2024-07-28 02:46:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][610/625] eta 0:00:08 lr 0.000093 wd 0.0500 time 0.5731 (0.5829) data time 0.0006 (0.0016) model time 0.5724 (0.5813) loss 6.1868 (6.9869) grad_norm 2.0648 (inf) loss_scale 128.0000 (258.7234) mem 22339MB +[2024-07-28 02:46:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [252/300][620/625] eta 0:00:02 lr 0.000093 wd 0.0500 time 0.5754 (0.5828) data time 0.0006 (0.0016) model time 0.5748 (0.5812) loss 6.8580 (6.9840) grad_norm 2.1570 (inf) loss_scale 128.0000 (256.6184) mem 22339MB +[2024-07-28 02:46:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 252 training takes 0:06:04 +[2024-07-28 02:46:19 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 02:46:23 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 02:46:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.466 (0.466) Loss 0.5044 (0.5044) Acc@1 90.234 (90.234) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-28 02:46:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.124 (0.156) Loss 0.7520 (0.6072) Acc@1 83.203 (87.971) Acc@5 97.217 (98.189) Mem 22339MB +[2024-07-28 02:46:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.124 (0.141) Loss 0.8276 (0.6924) Acc@1 81.055 (85.377) Acc@5 96.240 (97.377) Mem 22339MB +[2024-07-28 02:46:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.013 Acc@5 97.377 +[2024-07-28 02:46:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.0% +[2024-07-28 02:46:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.899 (0.899) Loss 0.5005 (0.5005) Acc@1 90.479 (90.479) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-28 02:46:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.196) Loss 0.7412 (0.6085) Acc@1 82.910 (88.117) Acc@5 96.973 (98.122) Mem 22339MB +[2024-07-28 02:46:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.162) Loss 0.8359 (0.6934) Acc@1 80.469 (85.335) Acc@5 96.240 (97.347) Mem 22339MB +[2024-07-28 02:46:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.921 Acc@5 97.347 +[2024-07-28 02:46:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.9% +[2024-07-28 02:46:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.92% +[2024-07-28 02:46:30 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-28 02:46:32 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-28 02:46:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][0/625] eta 0:09:20 lr 0.000093 wd 0.0500 time 0.8969 (0.8969) data time 0.3825 (0.3825) model time 0.0000 (0.0000) loss 6.4773 (6.4773) grad_norm 2.8260 (2.8260) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 02:46:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][10/625] eta 0:06:11 lr 0.000093 wd 0.0500 time 0.5647 (0.6045) data time 0.0006 (0.0357) model time 0.0000 (0.0000) loss 6.7560 (6.6542) grad_norm 7.6134 (3.3347) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 02:46:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][20/625] eta 0:05:57 lr 0.000093 wd 0.0500 time 0.5757 (0.5908) data time 0.0009 (0.0191) model time 0.0000 (0.0000) loss 7.6245 (6.7017) grad_norm 2.1671 (2.9734) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 02:46:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-28 02:46:48 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 02:46:50 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 02:49:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 02:49:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 02:50:18 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-28 02:50:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-28 02:50:29 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-28 02:50:29 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-28 02:50:29 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-28 02:50:29 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 253) +[2024-07-28 02:50:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-28 02:54:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 02:54:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 02:55:02 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-28 02:55:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-28 02:55:13 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-28 02:55:13 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-28 02:55:13 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-28 02:55:13 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 253) +[2024-07-28 02:55:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-28 02:55:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][30/625] eta 0:29:42 lr 0.000093 wd 0.0500 time 0.5649 (2.9953) data time 0.0005 (0.1470) model time 0.0000 (0.0000) loss 6.9571 (7.2204) grad_norm 2.1517 (2.1768) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 02:55:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][40/625] eta 0:12:18 lr 0.000092 wd 0.0500 time 0.5681 (1.2622) data time 0.0006 (0.0427) model time 0.0000 (0.0000) loss 6.8198 (7.0558) grad_norm 1.9161 (2.3291) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 02:55:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][50/625] eta 0:09:20 lr 0.000092 wd 0.0500 time 0.6211 (0.9749) data time 0.0008 (0.0253) model time 0.0000 (0.0000) loss 6.2002 (6.9769) grad_norm 2.0006 (2.3095) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 02:55:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][60/625] eta 0:08:05 lr 0.000092 wd 0.0500 time 0.5975 (0.8586) data time 0.0007 (0.0181) model time 0.5969 (0.5787) loss 5.9311 (7.0470) grad_norm 5.8664 (2.3720) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 02:55:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][70/625] eta 0:07:20 lr 0.000092 wd 0.0500 time 0.5687 (0.7937) data time 0.0006 (0.0145) model time 0.5681 (0.5748) loss 6.4614 (6.9808) grad_norm 1.8684 (2.3590) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 02:55:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][80/625] eta 0:06:51 lr 0.000092 wd 0.0500 time 0.5676 (0.7556) data time 0.0006 (0.0120) model time 0.5670 (0.5788) loss 7.1338 (6.9430) grad_norm 1.8652 (2.3835) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 02:56:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][90/625] eta 0:06:30 lr 0.000092 wd 0.0500 time 0.5679 (0.7299) data time 0.0006 (0.0103) model time 0.5673 (0.5816) loss 6.7268 (6.9055) grad_norm 1.7912 (2.3442) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 02:56:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][100/625] eta 0:06:12 lr 0.000092 wd 0.0500 time 0.5715 (0.7088) data time 0.0007 (0.0090) model time 0.5709 (0.5799) loss 7.8849 (6.9237) grad_norm 2.3533 (2.3797) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 02:56:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][110/625] eta 0:05:57 lr 0.000092 wd 0.0500 time 0.6154 (0.6934) data time 0.0008 (0.0082) model time 0.6146 (0.5794) loss 7.7025 (6.9090) grad_norm 1.6566 (2.3923) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 02:56:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][120/625] eta 0:05:44 lr 0.000092 wd 0.0500 time 0.5750 (0.6812) data time 0.0008 (0.0076) model time 0.5742 (0.5790) loss 6.7812 (6.9285) grad_norm 2.4728 (2.4737) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 02:56:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][130/625] eta 0:05:32 lr 0.000092 wd 0.0500 time 0.5970 (0.6709) data time 0.0008 (0.0069) model time 0.5962 (0.5783) loss 6.6610 (6.9637) grad_norm 1.8899 (2.4578) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 02:56:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][140/625] eta 0:05:21 lr 0.000092 wd 0.0500 time 0.5743 (0.6627) data time 0.0009 (0.0064) model time 0.5734 (0.5781) loss 6.8684 (6.9583) grad_norm 2.6367 (2.4529) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 02:56:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][150/625] eta 0:05:11 lr 0.000092 wd 0.0500 time 0.5719 (0.6565) data time 0.0008 (0.0061) model time 0.5711 (0.5787) loss 6.2932 (6.9733) grad_norm 2.4159 (2.4482) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 02:56:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][160/625] eta 0:05:02 lr 0.000092 wd 0.0500 time 0.5684 (0.6505) data time 0.0008 (0.0057) model time 0.5676 (0.5783) loss 8.1020 (6.9922) grad_norm 1.9449 (2.4406) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 02:56:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-28 02:56:48 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 02:56:54 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 02:58:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 02:58:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 03:01:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 03:01:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 03:01:28 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-28 03:01:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-28 03:01:45 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-28 03:01:46 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-28 03:01:46 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-28 03:01:46 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 253) +[2024-07-28 03:01:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-28 03:02:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][170/625] eta 0:21:46 lr 0.000092 wd 0.0500 time 0.5761 (2.8706) data time 0.0007 (0.1469) model time 0.5754 (2.7237) loss 8.4545 (7.5165) grad_norm 3.9299 (3.1643) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:02:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][180/625] eta 0:09:07 lr 0.000092 wd 0.0500 time 0.5758 (1.2304) data time 0.0006 (0.0426) model time 0.5752 (1.1878) loss 6.9599 (7.0921) grad_norm 2.1934 (4.7557) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:02:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][190/625] eta 0:06:56 lr 0.000092 wd 0.0500 time 0.5688 (0.9568) data time 0.0009 (0.0252) model time 0.5679 (0.9315) loss 6.5762 (7.0680) grad_norm 2.0478 (3.7622) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:02:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][200/625] eta 0:05:59 lr 0.000092 wd 0.0500 time 0.5712 (0.8457) data time 0.0007 (0.0181) model time 0.5705 (0.8276) loss 5.5818 (7.0510) grad_norm 3.2954 (3.7037) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:02:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][210/625] eta 0:05:26 lr 0.000092 wd 0.0500 time 0.5742 (0.7861) data time 0.0007 (0.0142) model time 0.5735 (0.7720) loss 6.0347 (7.0640) grad_norm 1.9370 (3.5149) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:02:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][220/625] eta 0:05:03 lr 0.000092 wd 0.0500 time 0.5913 (0.7502) data time 0.0007 (0.0121) model time 0.5906 (0.7381) loss 7.3153 (7.0418) grad_norm 2.7367 (3.4864) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:02:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][230/625] eta 0:04:46 lr 0.000091 wd 0.0500 time 0.5747 (0.7264) data time 0.0007 (0.0103) model time 0.5739 (0.7161) loss 6.7037 (7.0087) grad_norm 3.5881 (3.6547) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:02:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][240/625] eta 0:04:31 lr 0.000091 wd 0.0500 time 0.5764 (0.7062) data time 0.0007 (0.0090) model time 0.5758 (0.6972) loss 6.6475 (7.0010) grad_norm 2.4456 (3.4834) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:02:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][250/625] eta 0:04:19 lr 0.000091 wd 0.0500 time 0.6032 (0.6910) data time 0.0008 (0.0081) model time 0.6024 (0.6830) loss 7.6532 (7.0003) grad_norm 2.2392 (3.3607) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:02:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][260/625] eta 0:04:08 lr 0.000091 wd 0.0500 time 0.5776 (0.6796) data time 0.0008 (0.0075) model time 0.5768 (0.6721) loss 6.8733 (6.9935) grad_norm 2.8693 (3.2607) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:02:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][270/625] eta 0:03:58 lr 0.000091 wd 0.0500 time 0.6328 (0.6708) data time 0.0008 (0.0070) model time 0.6320 (0.6638) loss 6.1425 (7.0076) grad_norm 2.8964 (3.1925) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:03:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][280/625] eta 0:03:48 lr 0.000091 wd 0.0500 time 0.5879 (0.6624) data time 0.0009 (0.0064) model time 0.5871 (0.6560) loss 7.5020 (7.0290) grad_norm 3.2136 (3.1393) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:03:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][290/625] eta 0:03:39 lr 0.000091 wd 0.0500 time 0.5663 (0.6553) data time 0.0009 (0.0060) model time 0.5654 (0.6493) loss 6.6709 (7.0346) grad_norm 1.7460 (3.0862) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:03:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][300/625] eta 0:03:31 lr 0.000091 wd 0.0500 time 0.5880 (0.6500) data time 0.0008 (0.0057) model time 0.5872 (0.6443) loss 7.9383 (7.0349) grad_norm 2.8577 (3.0682) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:03:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][310/625] eta 0:03:23 lr 0.000091 wd 0.0500 time 0.6619 (0.6456) data time 0.0007 (0.0053) model time 0.6612 (0.6402) loss 6.9406 (7.0274) grad_norm 2.1767 (3.2024) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:03:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][320/625] eta 0:03:15 lr 0.000091 wd 0.0500 time 0.5760 (0.6412) data time 0.0007 (0.0051) model time 0.5753 (0.6361) loss 6.0372 (7.0137) grad_norm 2.5535 (3.1534) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:03:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-28 03:03:31 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 03:03:37 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 03:05:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 03:05:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 03:05:59 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-28 03:06:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-28 03:06:13 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-28 03:06:14 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-28 03:06:14 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-28 03:06:14 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 253) +[2024-07-28 03:06:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-28 03:06:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][330/625] eta 0:13:11 lr 0.000091 wd 0.0500 time 0.5995 (2.6842) data time 0.0010 (0.1097) model time 0.5985 (2.5745) loss 8.7058 (7.5762) grad_norm 2.2716 (2.4813) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:06:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][340/625] eta 0:06:33 lr 0.000091 wd 0.0500 time 0.6001 (1.3813) data time 0.0010 (0.0418) model time 0.5991 (1.3395) loss 7.2273 (7.2418) grad_norm 2.6220 (2.6776) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:06:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][350/625] eta 0:04:57 lr 0.000091 wd 0.0500 time 0.6067 (1.0802) data time 0.0009 (0.0262) model time 0.6059 (1.0540) loss 7.4091 (7.1716) grad_norm 3.2169 (2.9275) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:06:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][360/625] eta 0:04:10 lr 0.000091 wd 0.0500 time 0.5969 (0.9459) data time 0.0011 (0.0192) model time 0.5958 (0.9267) loss 7.8894 (7.2138) grad_norm 1.7064 (2.8374) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:06:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][370/625] eta 0:03:41 lr 0.000091 wd 0.0500 time 0.6040 (0.8700) data time 0.0011 (0.0153) model time 0.6029 (0.8547) loss 6.7774 (7.1970) grad_norm 2.5927 (2.7561) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:07:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][380/625] eta 0:03:22 lr 0.000091 wd 0.0500 time 0.5975 (0.8283) data time 0.0010 (0.0127) model time 0.5965 (0.8156) loss 8.2065 (7.1536) grad_norm 1.8933 (2.6885) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:07:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][390/625] eta 0:03:06 lr 0.000091 wd 0.0500 time 0.6010 (0.7939) data time 0.0010 (0.0110) model time 0.6000 (0.7830) loss 6.2014 (7.0868) grad_norm 2.4386 (2.6480) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:07:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][400/625] eta 0:02:52 lr 0.000091 wd 0.0500 time 0.6041 (0.7689) data time 0.0010 (0.0097) model time 0.6031 (0.7592) loss 7.7627 (7.0484) grad_norm 2.5466 (2.7843) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:07:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][410/625] eta 0:02:41 lr 0.000091 wd 0.0500 time 0.6007 (0.7497) data time 0.0009 (0.0087) model time 0.5998 (0.7410) loss 5.9671 (6.9859) grad_norm 2.5185 (2.7838) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:07:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][420/625] eta 0:02:30 lr 0.000090 wd 0.0500 time 0.6001 (0.7342) data time 0.0008 (0.0079) model time 0.5994 (0.7263) loss 6.8417 (6.9992) grad_norm 1.9564 (2.9342) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:07:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][430/625] eta 0:02:20 lr 0.000090 wd 0.0500 time 0.5987 (0.7214) data time 0.0010 (0.0072) model time 0.5977 (0.7142) loss 8.1054 (7.0396) grad_norm 2.5940 (2.9149) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:07:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][440/625] eta 0:02:11 lr 0.000090 wd 0.0500 time 0.6022 (0.7109) data time 0.0008 (0.0067) model time 0.6014 (0.7042) loss 7.5921 (7.0549) grad_norm 4.2013 (2.8758) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:07:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][450/625] eta 0:02:02 lr 0.000090 wd 0.0500 time 0.5998 (0.7021) data time 0.0008 (0.0062) model time 0.5990 (0.6958) loss 6.6057 (7.0684) grad_norm 2.1637 (2.8453) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:07:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][460/625] eta 0:01:54 lr 0.000090 wd 0.0500 time 0.6038 (0.6945) data time 0.0010 (0.0059) model time 0.6027 (0.6887) loss 6.0173 (7.0752) grad_norm 2.7123 (2.8117) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:07:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][470/625] eta 0:01:46 lr 0.000090 wd 0.0500 time 0.6069 (0.6884) data time 0.0008 (0.0055) model time 0.6060 (0.6828) loss 6.2196 (7.0441) grad_norm 9.6650 (2.8597) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:08:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][480/625] eta 0:01:39 lr 0.000090 wd 0.0500 time 0.6026 (0.6830) data time 0.0010 (0.0052) model time 0.6016 (0.6778) loss 7.6772 (7.0355) grad_norm 2.0596 (2.8565) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:08:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][490/625] eta 0:01:31 lr 0.000090 wd 0.0500 time 0.6028 (0.6783) data time 0.0010 (0.0050) model time 0.6018 (0.6733) loss 7.9210 (7.0384) grad_norm 2.8431 (2.8605) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:08:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][500/625] eta 0:01:24 lr 0.000090 wd 0.0500 time 0.6058 (0.6739) data time 0.0009 (0.0048) model time 0.6050 (0.6692) loss 6.3714 (7.0239) grad_norm 2.9029 (2.8292) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:08:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][510/625] eta 0:01:17 lr 0.000090 wd 0.0500 time 0.6004 (0.6699) data time 0.0011 (0.0046) model time 0.5993 (0.6654) loss 6.0550 (7.0181) grad_norm 2.5349 (2.7891) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:08:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][520/625] eta 0:01:09 lr 0.000090 wd 0.0500 time 0.6099 (0.6665) data time 0.0008 (0.0044) model time 0.6091 (0.6622) loss 6.3726 (6.9990) grad_norm 1.6184 (2.7517) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:08:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][530/625] eta 0:01:03 lr 0.000090 wd 0.0500 time 0.6072 (0.6637) data time 0.0009 (0.0042) model time 0.6064 (0.6595) loss 5.7766 (6.9843) grad_norm 1.9962 (2.7338) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:08:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][540/625] eta 0:00:56 lr 0.000090 wd 0.0500 time 0.6479 (0.6616) data time 0.0008 (0.0041) model time 0.6471 (0.6575) loss 5.3434 (6.9699) grad_norm 1.8600 (2.7118) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:08:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][550/625] eta 0:00:49 lr 0.000090 wd 0.0500 time 0.6106 (0.6592) data time 0.0009 (0.0040) model time 0.6097 (0.6551) loss 6.0782 (6.9822) grad_norm 3.9226 (2.7363) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:08:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-28 03:08:51 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 03:08:55 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 03:10:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 03:10:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 03:11:09 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-28 03:11:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-28 03:11:19 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-28 03:11:19 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-28 03:11:19 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-28 03:11:19 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 253) +[2024-07-28 03:11:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-28 03:11:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][560/625] eta 0:03:07 lr 0.000090 wd 0.0500 time 0.5754 (2.8881) data time 0.0006 (0.1523) model time 0.5748 (2.7358) loss 7.0182 (7.3435) grad_norm 9.7735 (3.8953) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:11:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][570/625] eta 0:01:07 lr 0.000090 wd 0.0500 time 0.5724 (1.2332) data time 0.0006 (0.0441) model time 0.5718 (1.1891) loss 6.9143 (7.1171) grad_norm 2.3769 (2.8036) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:11:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][580/625] eta 0:00:43 lr 0.000090 wd 0.0500 time 0.5712 (0.9575) data time 0.0009 (0.0261) model time 0.5703 (0.9314) loss 7.2763 (7.1775) grad_norm 4.3153 (3.0423) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:11:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][590/625] eta 0:00:29 lr 0.000090 wd 0.0500 time 0.5678 (0.8431) data time 0.0007 (0.0187) model time 0.5672 (0.8244) loss 6.4462 (7.1849) grad_norm 2.2257 (2.9993) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:11:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][600/625] eta 0:00:19 lr 0.000090 wd 0.0500 time 0.5664 (0.7806) data time 0.0007 (0.0146) model time 0.5657 (0.7660) loss 6.3328 (7.1140) grad_norm 2.4575 (2.9369) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:12:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][610/625] eta 0:00:11 lr 0.000089 wd 0.0500 time 0.5689 (0.7437) data time 0.0004 (0.0121) model time 0.5685 (0.7315) loss 7.8943 (7.0901) grad_norm 3.0996 (2.9180) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:12:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [253/300][620/625] eta 0:00:03 lr 0.000089 wd 0.0500 time 0.5723 (0.7197) data time 0.0004 (0.0103) model time 0.5719 (0.7094) loss 6.2893 (7.0446) grad_norm 2.9628 (2.9101) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-28 03:12:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 253 training takes 0:00:48 +[2024-07-28 03:12:11 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 03:12:17 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 03:12:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.468 (0.468) Loss 0.5020 (0.5020) Acc@1 90.381 (90.381) Acc@5 99.023 (99.023) Mem 22344MB +[2024-07-28 03:12:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.124 (0.156) Loss 0.7451 (0.6038) Acc@1 83.350 (88.126) Acc@5 97.070 (98.127) Mem 22344MB +[2024-07-28 03:12:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.141) Loss 0.8252 (0.6887) Acc@1 80.273 (85.484) Acc@5 96.436 (97.354) Mem 22344MB +[2024-07-28 03:12:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.083 Acc@5 97.359 +[2024-07-28 03:12:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.1% +[2024-07-28 03:12:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 85.08% +[2024-07-28 03:12:22 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-28 03:12:24 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-28 03:12:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.458 (0.458) Loss 0.5000 (0.5000) Acc@1 90.381 (90.381) Acc@5 98.975 (98.975) Mem 22344MB +[2024-07-28 03:12:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.124 (0.155) Loss 0.7417 (0.6082) Acc@1 83.008 (88.113) Acc@5 97.021 (98.140) Mem 22344MB +[2024-07-28 03:12:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.124 (0.140) Loss 0.8350 (0.6930) Acc@1 80.566 (85.345) Acc@5 96.240 (97.349) Mem 22344MB +[2024-07-28 03:12:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.939 Acc@5 97.347 +[2024-07-28 03:12:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 84.9% +[2024-07-28 03:12:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.94% +[2024-07-28 03:12:28 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-28 03:12:29 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-28 03:12:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][0/625] eta 0:10:27 lr 0.000089 wd 0.0500 time 1.0048 (1.0048) data time 0.3492 (0.3492) model time 0.0000 (0.0000) loss 7.1734 (7.1734) grad_norm 2.6094 (2.6094) loss_scale 128.0000 (128.0000) mem 22337MB +[2024-07-28 03:12:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][10/625] eta 0:06:16 lr 0.000089 wd 0.0500 time 0.5702 (0.6121) data time 0.0008 (0.0325) model time 0.0000 (0.0000) loss 7.4443 (7.1418) grad_norm 2.1413 (2.6426) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:12:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][20/625] eta 0:05:58 lr 0.000089 wd 0.0500 time 0.5702 (0.5927) data time 0.0007 (0.0174) model time 0.0000 (0.0000) loss 7.0683 (7.0301) grad_norm 2.8459 (2.6449) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:12:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][30/625] eta 0:05:48 lr 0.000089 wd 0.0500 time 0.5669 (0.5859) data time 0.0009 (0.0121) model time 0.0000 (0.0000) loss 7.0254 (7.0577) grad_norm 2.5453 (2.7437) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:12:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][40/625] eta 0:05:40 lr 0.000089 wd 0.0500 time 0.5703 (0.5822) data time 0.0006 (0.0093) model time 0.0000 (0.0000) loss 6.2737 (7.0193) grad_norm 2.4078 (2.7930) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:12:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][50/625] eta 0:05:33 lr 0.000089 wd 0.0500 time 0.5684 (0.5800) data time 0.0009 (0.0077) model time 0.0000 (0.0000) loss 8.4049 (7.0209) grad_norm 3.0269 (2.7161) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:13:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][60/625] eta 0:05:26 lr 0.000089 wd 0.0500 time 0.5686 (0.5787) data time 0.0007 (0.0066) model time 0.5679 (0.5709) loss 6.5402 (6.9797) grad_norm 3.6043 (2.6844) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:13:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][70/625] eta 0:05:20 lr 0.000089 wd 0.0500 time 0.5720 (0.5780) data time 0.0006 (0.0058) model time 0.5713 (0.5718) loss 7.3283 (6.9617) grad_norm 3.3614 (2.6321) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:13:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][80/625] eta 0:05:14 lr 0.000089 wd 0.0500 time 0.5701 (0.5774) data time 0.0006 (0.0052) model time 0.5694 (0.5721) loss 6.0212 (6.9267) grad_norm 23.7616 (2.9114) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:13:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][90/625] eta 0:05:08 lr 0.000089 wd 0.0500 time 0.5655 (0.5771) data time 0.0006 (0.0047) model time 0.5649 (0.5724) loss 6.9577 (6.9283) grad_norm 3.9323 (2.8539) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:13:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][100/625] eta 0:05:02 lr 0.000089 wd 0.0500 time 0.5698 (0.5767) data time 0.0008 (0.0043) model time 0.5690 (0.5725) loss 7.1050 (6.9523) grad_norm 2.8106 (2.7950) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:13:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][110/625] eta 0:04:56 lr 0.000089 wd 0.0500 time 0.5687 (0.5763) data time 0.0006 (0.0040) model time 0.5681 (0.5723) loss 7.0751 (6.9490) grad_norm 1.9477 (2.7528) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:13:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][120/625] eta 0:04:51 lr 0.000089 wd 0.0500 time 0.5686 (0.5765) data time 0.0006 (0.0037) model time 0.5680 (0.5731) loss 7.4804 (6.9337) grad_norm 2.3614 (2.7137) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:13:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][130/625] eta 0:04:45 lr 0.000089 wd 0.0500 time 0.5690 (0.5762) data time 0.0007 (0.0035) model time 0.5683 (0.5729) loss 6.6520 (6.8926) grad_norm 1.8512 (2.6658) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:13:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][140/625] eta 0:04:39 lr 0.000089 wd 0.0500 time 0.5739 (0.5760) data time 0.0007 (0.0033) model time 0.5732 (0.5729) loss 6.4021 (6.8775) grad_norm 1.9440 (2.6482) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:13:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][150/625] eta 0:04:33 lr 0.000089 wd 0.0500 time 0.5725 (0.5759) data time 0.0006 (0.0032) model time 0.5719 (0.5730) loss 8.2444 (6.8741) grad_norm 2.6747 (2.6354) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:14:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][160/625] eta 0:04:27 lr 0.000089 wd 0.0500 time 0.5690 (0.5759) data time 0.0006 (0.0030) model time 0.5684 (0.5732) loss 6.1149 (6.8841) grad_norm 4.4487 (2.6368) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:14:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][170/625] eta 0:04:21 lr 0.000088 wd 0.0500 time 0.5730 (0.5758) data time 0.0006 (0.0029) model time 0.5724 (0.5731) loss 5.9767 (6.8895) grad_norm 1.5889 (2.6158) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:14:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][180/625] eta 0:04:16 lr 0.000088 wd 0.0500 time 0.5657 (0.5756) data time 0.0007 (0.0028) model time 0.5650 (0.5731) loss 6.7419 (6.8779) grad_norm 2.1406 (2.6081) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:14:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][190/625] eta 0:04:10 lr 0.000088 wd 0.0500 time 0.5761 (0.5756) data time 0.0008 (0.0027) model time 0.5753 (0.5732) loss 7.0172 (6.8654) grad_norm 2.9337 (2.5977) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:14:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][200/625] eta 0:04:04 lr 0.000088 wd 0.0500 time 0.5695 (0.5757) data time 0.0006 (0.0026) model time 0.5689 (0.5734) loss 6.5876 (6.8543) grad_norm 2.3717 (2.6006) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:14:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][210/625] eta 0:03:59 lr 0.000088 wd 0.0500 time 0.5678 (0.5765) data time 0.0009 (0.0025) model time 0.5669 (0.5745) loss 7.8111 (6.8632) grad_norm 2.2350 (2.6028) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:14:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][220/625] eta 0:03:53 lr 0.000088 wd 0.0500 time 0.5717 (0.5764) data time 0.0009 (0.0024) model time 0.5708 (0.5745) loss 6.9903 (6.8680) grad_norm 2.2311 (2.5917) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:14:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][230/625] eta 0:03:47 lr 0.000088 wd 0.0500 time 0.5745 (0.5764) data time 0.0009 (0.0024) model time 0.5736 (0.5745) loss 8.1416 (6.8547) grad_norm 3.5995 (2.5920) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:14:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][240/625] eta 0:03:41 lr 0.000088 wd 0.0500 time 0.5707 (0.5764) data time 0.0009 (0.0023) model time 0.5698 (0.5745) loss 8.3387 (6.8574) grad_norm 2.5440 (2.5809) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:14:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][250/625] eta 0:03:36 lr 0.000088 wd 0.0500 time 0.5711 (0.5762) data time 0.0007 (0.0022) model time 0.5705 (0.5744) loss 7.6720 (6.8855) grad_norm 2.4331 (2.5692) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:15:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][260/625] eta 0:03:30 lr 0.000088 wd 0.0500 time 0.5696 (0.5764) data time 0.0008 (0.0022) model time 0.5688 (0.5747) loss 5.0118 (6.8845) grad_norm 16.3330 (2.6118) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:15:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][270/625] eta 0:03:24 lr 0.000088 wd 0.0500 time 0.5675 (0.5768) data time 0.0009 (0.0021) model time 0.5667 (0.5753) loss 6.8383 (6.8826) grad_norm 2.9162 (2.6020) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:15:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][280/625] eta 0:03:19 lr 0.000088 wd 0.0500 time 0.5718 (0.5769) data time 0.0006 (0.0021) model time 0.5712 (0.5753) loss 6.9271 (6.8945) grad_norm 2.3128 (2.7579) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:15:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][290/625] eta 0:03:13 lr 0.000088 wd 0.0500 time 0.5710 (0.5771) data time 0.0006 (0.0021) model time 0.5703 (0.5755) loss 6.6784 (6.8940) grad_norm 2.4103 (2.7916) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:15:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][300/625] eta 0:03:07 lr 0.000088 wd 0.0500 time 0.5695 (0.5773) data time 0.0008 (0.0020) model time 0.5687 (0.5758) loss 8.0695 (6.9030) grad_norm 2.4890 (2.7913) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:15:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][310/625] eta 0:03:01 lr 0.000088 wd 0.0500 time 0.5713 (0.5773) data time 0.0007 (0.0020) model time 0.5705 (0.5758) loss 6.2914 (6.8978) grad_norm 2.4772 (2.8072) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:15:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][320/625] eta 0:02:56 lr 0.000088 wd 0.0500 time 0.5701 (0.5775) data time 0.0006 (0.0020) model time 0.5695 (0.5761) loss 6.5062 (6.8886) grad_norm 61.6632 (3.0214) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:15:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][330/625] eta 0:02:50 lr 0.000088 wd 0.0500 time 0.5722 (0.5775) data time 0.0007 (0.0019) model time 0.5716 (0.5761) loss 6.8933 (6.8936) grad_norm 2.3999 (3.0112) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:15:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][340/625] eta 0:02:44 lr 0.000088 wd 0.0500 time 0.5672 (0.5777) data time 0.0008 (0.0019) model time 0.5664 (0.5764) loss 7.4881 (6.8992) grad_norm 2.5305 (2.9894) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:15:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][350/625] eta 0:02:38 lr 0.000088 wd 0.0500 time 0.5718 (0.5776) data time 0.0007 (0.0019) model time 0.5711 (0.5763) loss 6.8722 (6.9024) grad_norm 2.4469 (2.9826) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:15:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][360/625] eta 0:02:33 lr 0.000087 wd 0.0500 time 0.5727 (0.5780) data time 0.0007 (0.0019) model time 0.5720 (0.5767) loss 6.0467 (6.9065) grad_norm 2.8865 (2.9809) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:16:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][370/625] eta 0:02:27 lr 0.000087 wd 0.0500 time 0.5697 (0.5780) data time 0.0009 (0.0018) model time 0.5688 (0.5768) loss 6.2124 (6.9078) grad_norm 1.9689 (2.9635) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:16:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][380/625] eta 0:02:21 lr 0.000087 wd 0.0500 time 0.5715 (0.5781) data time 0.0009 (0.0018) model time 0.5706 (0.5769) loss 7.4529 (6.9083) grad_norm 2.2987 (2.9591) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:16:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][390/625] eta 0:02:15 lr 0.000087 wd 0.0500 time 0.5717 (0.5780) data time 0.0008 (0.0018) model time 0.5709 (0.5768) loss 6.8006 (6.9011) grad_norm 4.1120 (3.0056) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:16:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][400/625] eta 0:02:10 lr 0.000087 wd 0.0500 time 0.5735 (0.5779) data time 0.0007 (0.0018) model time 0.5729 (0.5767) loss 5.6741 (6.8915) grad_norm 1.9862 (2.9938) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:16:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][410/625] eta 0:02:04 lr 0.000087 wd 0.0500 time 0.5699 (0.5778) data time 0.0009 (0.0017) model time 0.5691 (0.5766) loss 7.7766 (6.8954) grad_norm 2.9483 (2.9885) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:16:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][420/625] eta 0:01:58 lr 0.000087 wd 0.0500 time 0.5677 (0.5778) data time 0.0008 (0.0017) model time 0.5669 (0.5767) loss 6.5321 (6.8921) grad_norm 2.2066 (3.0121) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:16:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][430/625] eta 0:01:52 lr 0.000087 wd 0.0500 time 0.5678 (0.5782) data time 0.0007 (0.0017) model time 0.5672 (0.5771) loss 6.6544 (6.8892) grad_norm 13.1125 (3.0245) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:16:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][440/625] eta 0:01:46 lr 0.000087 wd 0.0500 time 0.5706 (0.5781) data time 0.0007 (0.0017) model time 0.5700 (0.5770) loss 8.0102 (6.8962) grad_norm 2.2695 (3.0254) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:16:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][450/625] eta 0:01:41 lr 0.000087 wd 0.0500 time 0.5679 (0.5781) data time 0.0008 (0.0017) model time 0.5670 (0.5769) loss 6.2493 (6.9017) grad_norm 2.5134 (3.0260) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:16:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][460/625] eta 0:01:35 lr 0.000087 wd 0.0500 time 0.5698 (0.5780) data time 0.0006 (0.0016) model time 0.5691 (0.5769) loss 7.7170 (6.8921) grad_norm 2.7554 (3.0286) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:17:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][470/625] eta 0:01:29 lr 0.000087 wd 0.0500 time 0.5656 (0.5779) data time 0.0009 (0.0016) model time 0.5647 (0.5768) loss 7.7913 (6.8964) grad_norm 2.3918 (3.0153) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:17:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][480/625] eta 0:01:23 lr 0.000087 wd 0.0500 time 0.5684 (0.5779) data time 0.0007 (0.0016) model time 0.5677 (0.5768) loss 7.2892 (6.8936) grad_norm 2.1143 (3.0084) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:17:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][490/625] eta 0:01:18 lr 0.000087 wd 0.0500 time 0.5690 (0.5779) data time 0.0007 (0.0016) model time 0.5682 (0.5768) loss 7.9871 (6.9007) grad_norm 2.2546 (3.0049) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:17:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][500/625] eta 0:01:12 lr 0.000087 wd 0.0500 time 0.5693 (0.5779) data time 0.0009 (0.0016) model time 0.5684 (0.5768) loss 6.9561 (6.9070) grad_norm 1.6601 (2.9893) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:17:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][510/625] eta 0:01:06 lr 0.000087 wd 0.0500 time 0.5686 (0.5779) data time 0.0007 (0.0016) model time 0.5679 (0.5767) loss 7.9458 (6.9101) grad_norm 2.6304 (2.9763) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:17:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][520/625] eta 0:01:00 lr 0.000087 wd 0.0500 time 0.5693 (0.5778) data time 0.0009 (0.0016) model time 0.5683 (0.5766) loss 5.7947 (6.9085) grad_norm 2.2275 (2.9744) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:17:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][530/625] eta 0:00:54 lr 0.000087 wd 0.0500 time 0.5698 (0.5780) data time 0.0008 (0.0015) model time 0.5690 (0.5769) loss 6.9365 (6.9152) grad_norm 2.8479 (2.9738) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:17:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][540/625] eta 0:00:49 lr 0.000087 wd 0.0500 time 0.5709 (0.5779) data time 0.0006 (0.0015) model time 0.5702 (0.5768) loss 6.9645 (6.9133) grad_norm 2.1630 (2.9697) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:17:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][550/625] eta 0:00:43 lr 0.000087 wd 0.0500 time 0.5694 (0.5781) data time 0.0006 (0.0016) model time 0.5687 (0.5770) loss 6.2943 (6.9105) grad_norm 1.8630 (2.9588) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:17:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][560/625] eta 0:00:37 lr 0.000086 wd 0.0500 time 0.5721 (0.5784) data time 0.0009 (0.0015) model time 0.5713 (0.5773) loss 6.9165 (6.9115) grad_norm 3.6737 (2.9820) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:17:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][570/625] eta 0:00:31 lr 0.000086 wd 0.0500 time 0.5653 (0.5783) data time 0.0006 (0.0015) model time 0.5646 (0.5773) loss 7.1661 (6.9119) grad_norm 1.7461 (2.9723) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:18:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][580/625] eta 0:00:26 lr 0.000086 wd 0.0500 time 0.5725 (0.5786) data time 0.0006 (0.0016) model time 0.5719 (0.5775) loss 6.6149 (6.9037) grad_norm 2.2180 (2.9600) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:18:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][590/625] eta 0:00:20 lr 0.000086 wd 0.0500 time 0.5723 (0.5786) data time 0.0008 (0.0016) model time 0.5715 (0.5775) loss 7.0899 (6.8997) grad_norm 1.8442 (2.9447) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:18:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][600/625] eta 0:00:14 lr 0.000086 wd 0.0500 time 0.5698 (0.5786) data time 0.0008 (0.0016) model time 0.5691 (0.5775) loss 7.1156 (6.8977) grad_norm 2.6187 (2.9482) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:18:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][610/625] eta 0:00:08 lr 0.000086 wd 0.0500 time 0.5703 (0.5785) data time 0.0004 (0.0015) model time 0.5699 (0.5774) loss 6.6611 (6.8962) grad_norm 2.3618 (2.9430) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:18:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [254/300][620/625] eta 0:00:02 lr 0.000086 wd 0.0500 time 0.5706 (0.5785) data time 0.0006 (0.0015) model time 0.5700 (0.5774) loss 6.3749 (6.8924) grad_norm 3.0769 (2.9445) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:18:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 254 training takes 0:06:01 +[2024-07-28 03:18:31 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 03:18:32 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 03:18:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.453 (0.453) Loss 0.4951 (0.4951) Acc@1 90.527 (90.527) Acc@5 98.877 (98.877) Mem 22339MB +[2024-07-28 03:18:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.155) Loss 0.7344 (0.5964) Acc@1 83.203 (88.081) Acc@5 96.924 (98.167) Mem 22339MB +[2024-07-28 03:18:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.141) Loss 0.8159 (0.6825) Acc@1 80.859 (85.472) Acc@5 96.338 (97.359) Mem 22339MB +[2024-07-28 03:18:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.081 Acc@5 97.371 +[2024-07-28 03:18:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.1% +[2024-07-28 03:18:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.784 (0.784) Loss 0.5005 (0.5005) Acc@1 90.430 (90.430) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-28 03:18:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.185) Loss 0.7412 (0.6078) Acc@1 83.057 (88.126) Acc@5 97.021 (98.145) Mem 22339MB +[2024-07-28 03:18:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.157) Loss 0.8345 (0.6926) Acc@1 80.566 (85.366) Acc@5 96.240 (97.354) Mem 22339MB +[2024-07-28 03:18:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.955 Acc@5 97.353 +[2024-07-28 03:18:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.0% +[2024-07-28 03:18:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.95% +[2024-07-28 03:18:39 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-28 03:18:41 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-28 03:18:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][0/625] eta 0:09:16 lr 0.000086 wd 0.0500 time 0.8901 (0.8901) data time 0.3713 (0.3713) model time 0.0000 (0.0000) loss 8.1388 (8.1388) grad_norm 5.1671 (5.1671) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:18:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][10/625] eta 0:06:09 lr 0.000086 wd 0.0500 time 0.5698 (0.6007) data time 0.0009 (0.0345) model time 0.0000 (0.0000) loss 7.1403 (6.7717) grad_norm 3.2221 (3.3707) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:18:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][20/625] eta 0:05:55 lr 0.000086 wd 0.0500 time 0.5716 (0.5872) data time 0.0006 (0.0184) model time 0.0000 (0.0000) loss 6.6247 (6.6695) grad_norm 3.3421 (3.0980) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:19:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][30/625] eta 0:05:50 lr 0.000086 wd 0.0500 time 0.5713 (0.5884) data time 0.0006 (0.0127) model time 0.0000 (0.0000) loss 6.2884 (6.7371) grad_norm 1.7771 (2.8847) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:19:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][40/625] eta 0:05:41 lr 0.000086 wd 0.0500 time 0.5687 (0.5845) data time 0.0009 (0.0098) model time 0.0000 (0.0000) loss 7.4160 (6.8131) grad_norm 2.1341 (2.6959) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:19:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][50/625] eta 0:05:34 lr 0.000086 wd 0.0500 time 0.5694 (0.5825) data time 0.0007 (0.0081) model time 0.0000 (0.0000) loss 6.6247 (6.8598) grad_norm 2.8493 (2.6765) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:19:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][60/625] eta 0:05:28 lr 0.000086 wd 0.0500 time 0.5723 (0.5808) data time 0.0008 (0.0069) model time 0.5715 (0.5713) loss 6.5209 (6.8579) grad_norm 2.7941 (2.7228) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:19:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][70/625] eta 0:05:21 lr 0.000086 wd 0.0500 time 0.5701 (0.5796) data time 0.0008 (0.0060) model time 0.5693 (0.5713) loss 7.8219 (6.9100) grad_norm 2.9580 (2.6718) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:19:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][80/625] eta 0:05:15 lr 0.000086 wd 0.0500 time 0.5709 (0.5786) data time 0.0009 (0.0054) model time 0.5700 (0.5710) loss 7.7216 (6.9397) grad_norm 2.7463 (2.6291) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:19:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][90/625] eta 0:05:09 lr 0.000086 wd 0.0500 time 0.5710 (0.5781) data time 0.0006 (0.0049) model time 0.5703 (0.5717) loss 6.0435 (6.9587) grad_norm 7.6506 (2.6944) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:19:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][100/625] eta 0:05:04 lr 0.000086 wd 0.0500 time 0.5725 (0.5793) data time 0.0008 (0.0045) model time 0.5717 (0.5751) loss 7.0082 (6.9544) grad_norm 2.3336 (2.6709) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:19:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][110/625] eta 0:04:58 lr 0.000086 wd 0.0500 time 0.5737 (0.5791) data time 0.0008 (0.0042) model time 0.5729 (0.5754) loss 5.9976 (6.9174) grad_norm 3.0285 (2.6555) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:19:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][120/625] eta 0:04:52 lr 0.000085 wd 0.0500 time 0.5709 (0.5785) data time 0.0009 (0.0039) model time 0.5700 (0.5748) loss 7.9530 (6.8795) grad_norm 1.7378 (2.5964) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:19:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][130/625] eta 0:04:46 lr 0.000085 wd 0.0500 time 0.5713 (0.5782) data time 0.0008 (0.0037) model time 0.5705 (0.5746) loss 7.2005 (6.8949) grad_norm 3.6036 (3.2745) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:20:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][140/625] eta 0:04:40 lr 0.000085 wd 0.0500 time 0.5671 (0.5778) data time 0.0008 (0.0035) model time 0.5663 (0.5742) loss 6.9249 (6.8815) grad_norm 2.6301 (3.2280) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:20:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][150/625] eta 0:04:34 lr 0.000085 wd 0.0500 time 0.5702 (0.5774) data time 0.0006 (0.0033) model time 0.5696 (0.5740) loss 5.9856 (6.8606) grad_norm 2.2593 (3.1452) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:20:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][160/625] eta 0:04:28 lr 0.000085 wd 0.0500 time 0.5716 (0.5773) data time 0.0009 (0.0031) model time 0.5707 (0.5741) loss 7.8254 (6.8726) grad_norm 1.7445 (3.1476) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:20:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][170/625] eta 0:04:22 lr 0.000085 wd 0.0500 time 0.5736 (0.5773) data time 0.0008 (0.0030) model time 0.5728 (0.5742) loss 7.6075 (6.8740) grad_norm 1.8217 (3.0966) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:20:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][180/625] eta 0:04:16 lr 0.000085 wd 0.0500 time 0.5675 (0.5771) data time 0.0008 (0.0029) model time 0.5666 (0.5741) loss 8.1983 (6.8736) grad_norm 2.9505 (3.0763) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:20:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][190/625] eta 0:04:10 lr 0.000085 wd 0.0500 time 0.5688 (0.5769) data time 0.0006 (0.0028) model time 0.5683 (0.5741) loss 7.5863 (6.8732) grad_norm 2.9497 (3.0486) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:20:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][200/625] eta 0:04:05 lr 0.000085 wd 0.0500 time 0.5703 (0.5767) data time 0.0008 (0.0027) model time 0.5695 (0.5739) loss 7.7771 (6.8735) grad_norm 3.3525 (3.0261) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:20:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][210/625] eta 0:03:59 lr 0.000085 wd 0.0500 time 0.5722 (0.5765) data time 0.0006 (0.0026) model time 0.5716 (0.5738) loss 6.0408 (6.8889) grad_norm 2.6443 (3.0135) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:20:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][220/625] eta 0:03:53 lr 0.000085 wd 0.0500 time 0.5724 (0.5764) data time 0.0006 (0.0025) model time 0.5718 (0.5737) loss 6.4744 (6.8930) grad_norm 2.4536 (2.9925) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:20:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][230/625] eta 0:03:47 lr 0.000085 wd 0.0500 time 0.5725 (0.5764) data time 0.0006 (0.0024) model time 0.5719 (0.5738) loss 7.1231 (6.9031) grad_norm 6.7858 (3.1745) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:21:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][240/625] eta 0:03:41 lr 0.000085 wd 0.0500 time 0.5699 (0.5764) data time 0.0008 (0.0024) model time 0.5691 (0.5739) loss 6.2553 (6.9084) grad_norm 2.3493 (3.1570) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:21:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][250/625] eta 0:03:36 lr 0.000085 wd 0.0500 time 0.5699 (0.5775) data time 0.0008 (0.0023) model time 0.5691 (0.5754) loss 6.0284 (6.8830) grad_norm 2.9725 (3.1269) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:21:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][260/625] eta 0:03:30 lr 0.000085 wd 0.0500 time 0.5693 (0.5773) data time 0.0006 (0.0022) model time 0.5687 (0.5753) loss 6.8512 (6.8741) grad_norm 1.8761 (3.0929) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:21:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][270/625] eta 0:03:24 lr 0.000085 wd 0.0500 time 0.5696 (0.5772) data time 0.0006 (0.0022) model time 0.5690 (0.5751) loss 6.3752 (6.8692) grad_norm 2.4415 (3.0738) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:21:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][280/625] eta 0:03:19 lr 0.000085 wd 0.0500 time 0.5685 (0.5772) data time 0.0009 (0.0021) model time 0.5676 (0.5753) loss 6.8639 (6.8816) grad_norm 2.9395 (3.0713) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:21:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][290/625] eta 0:03:13 lr 0.000085 wd 0.0500 time 0.5686 (0.5772) data time 0.0008 (0.0021) model time 0.5677 (0.5752) loss 7.6189 (6.8862) grad_norm 1.9961 (3.0441) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:21:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][300/625] eta 0:03:07 lr 0.000085 wd 0.0500 time 0.5714 (0.5773) data time 0.0008 (0.0021) model time 0.5706 (0.5754) loss 7.0435 (6.8869) grad_norm 2.1033 (3.0192) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:21:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][310/625] eta 0:03:01 lr 0.000085 wd 0.0500 time 0.5724 (0.5773) data time 0.0007 (0.0021) model time 0.5717 (0.5754) loss 6.6335 (6.8887) grad_norm 1.8655 (3.0011) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:21:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][320/625] eta 0:02:56 lr 0.000084 wd 0.0500 time 0.5764 (0.5779) data time 0.0006 (0.0020) model time 0.5758 (0.5761) loss 6.5310 (6.8999) grad_norm 1.8310 (3.0801) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:21:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][330/625] eta 0:02:50 lr 0.000084 wd 0.0500 time 0.5737 (0.5779) data time 0.0008 (0.0020) model time 0.5730 (0.5761) loss 5.6136 (6.8905) grad_norm 2.0768 (3.0549) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:21:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][340/625] eta 0:02:44 lr 0.000084 wd 0.0500 time 0.5664 (0.5780) data time 0.0006 (0.0019) model time 0.5658 (0.5763) loss 6.5938 (6.8811) grad_norm 2.8283 (3.1065) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:22:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][350/625] eta 0:02:38 lr 0.000084 wd 0.0500 time 0.5695 (0.5781) data time 0.0009 (0.0019) model time 0.5686 (0.5764) loss 5.5733 (6.8835) grad_norm 2.0823 (3.0958) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:22:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][360/625] eta 0:02:33 lr 0.000084 wd 0.0500 time 0.5695 (0.5782) data time 0.0009 (0.0019) model time 0.5686 (0.5766) loss 6.5989 (6.8852) grad_norm 1.8015 (3.1037) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:22:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][370/625] eta 0:02:27 lr 0.000084 wd 0.0500 time 0.5708 (0.5781) data time 0.0008 (0.0019) model time 0.5700 (0.5765) loss 7.5916 (6.8919) grad_norm 2.0657 (3.0837) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:22:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][380/625] eta 0:02:21 lr 0.000084 wd 0.0500 time 0.5718 (0.5780) data time 0.0008 (0.0018) model time 0.5710 (0.5764) loss 7.2808 (6.8876) grad_norm 2.6278 (3.0598) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:22:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][390/625] eta 0:02:15 lr 0.000084 wd 0.0500 time 0.5665 (0.5781) data time 0.0009 (0.0018) model time 0.5656 (0.5765) loss 6.9790 (6.8944) grad_norm 2.3771 (3.0516) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:22:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][400/625] eta 0:02:10 lr 0.000084 wd 0.0500 time 0.5675 (0.5783) data time 0.0008 (0.0018) model time 0.5666 (0.5768) loss 6.3780 (6.8911) grad_norm 2.1385 (3.0289) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:22:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][410/625] eta 0:02:04 lr 0.000084 wd 0.0500 time 0.5730 (0.5782) data time 0.0009 (0.0018) model time 0.5721 (0.5767) loss 8.4064 (6.9005) grad_norm 2.7710 (3.0313) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:22:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][420/625] eta 0:01:58 lr 0.000084 wd 0.0500 time 0.5724 (0.5781) data time 0.0008 (0.0018) model time 0.5716 (0.5766) loss 8.4709 (6.9116) grad_norm 1.9247 (3.0419) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:22:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][430/625] eta 0:01:52 lr 0.000084 wd 0.0500 time 0.5714 (0.5783) data time 0.0009 (0.0017) model time 0.5705 (0.5768) loss 6.3468 (6.9195) grad_norm 7.1359 (3.0439) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:22:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][440/625] eta 0:01:46 lr 0.000084 wd 0.0500 time 0.5688 (0.5782) data time 0.0007 (0.0017) model time 0.5682 (0.5767) loss 6.8734 (6.9227) grad_norm 3.8084 (3.0319) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:23:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][450/625] eta 0:01:41 lr 0.000084 wd 0.0500 time 0.5675 (0.5783) data time 0.0009 (0.0017) model time 0.5667 (0.5769) loss 7.4073 (6.9271) grad_norm 2.2956 (3.0230) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:23:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][460/625] eta 0:01:35 lr 0.000084 wd 0.0500 time 0.5672 (0.5786) data time 0.0007 (0.0017) model time 0.5666 (0.5772) loss 6.4813 (6.9268) grad_norm 1.7895 (3.0177) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:23:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][470/625] eta 0:01:29 lr 0.000084 wd 0.0500 time 0.5710 (0.5793) data time 0.0006 (0.0017) model time 0.5703 (0.5779) loss 5.8398 (6.9187) grad_norm 2.2184 (2.9958) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:23:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][480/625] eta 0:01:23 lr 0.000084 wd 0.0500 time 0.5713 (0.5792) data time 0.0007 (0.0017) model time 0.5706 (0.5779) loss 6.9095 (6.9259) grad_norm 2.2397 (2.9888) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:23:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][490/625] eta 0:01:18 lr 0.000084 wd 0.0500 time 0.5611 (0.5791) data time 0.0009 (0.0017) model time 0.5602 (0.5778) loss 7.5212 (6.9372) grad_norm 3.5562 (3.0304) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:23:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][500/625] eta 0:01:12 lr 0.000084 wd 0.0500 time 0.5713 (0.5791) data time 0.0009 (0.0016) model time 0.5705 (0.5778) loss 6.8673 (6.9366) grad_norm 2.0688 (3.0128) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:23:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][510/625] eta 0:01:06 lr 0.000084 wd 0.0500 time 0.5684 (0.5790) data time 0.0008 (0.0016) model time 0.5676 (0.5777) loss 7.2398 (6.9376) grad_norm 1.6573 (3.0020) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:23:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][520/625] eta 0:01:00 lr 0.000083 wd 0.0500 time 0.5718 (0.5789) data time 0.0008 (0.0016) model time 0.5710 (0.5776) loss 7.2308 (6.9337) grad_norm 2.9741 (3.0061) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:23:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][530/625] eta 0:00:55 lr 0.000083 wd 0.0500 time 0.5736 (0.5790) data time 0.0008 (0.0016) model time 0.5728 (0.5777) loss 8.1411 (6.9318) grad_norm 1.9888 (2.9947) loss_scale 256.0000 (128.4821) mem 22339MB +[2024-07-28 03:23:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][540/625] eta 0:00:49 lr 0.000083 wd 0.0500 time 0.5713 (0.5792) data time 0.0008 (0.0016) model time 0.5705 (0.5779) loss 5.8471 (6.9276) grad_norm 4.6213 (3.0053) loss_scale 256.0000 (130.8392) mem 22339MB +[2024-07-28 03:24:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][550/625] eta 0:00:43 lr 0.000083 wd 0.0500 time 0.5679 (0.5791) data time 0.0009 (0.0016) model time 0.5670 (0.5779) loss 6.8370 (6.9328) grad_norm 2.3708 (2.9943) loss_scale 256.0000 (133.1107) mem 22339MB +[2024-07-28 03:24:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][560/625] eta 0:00:37 lr 0.000083 wd 0.0500 time 0.5735 (0.5791) data time 0.0009 (0.0016) model time 0.5727 (0.5778) loss 7.2012 (6.9284) grad_norm 1.6657 (2.9827) loss_scale 256.0000 (135.3012) mem 22339MB +[2024-07-28 03:24:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][570/625] eta 0:00:31 lr 0.000083 wd 0.0500 time 0.5703 (0.5789) data time 0.0008 (0.0015) model time 0.5694 (0.5777) loss 7.6055 (6.9301) grad_norm 4.1545 (2.9819) loss_scale 256.0000 (137.4151) mem 22339MB +[2024-07-28 03:24:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][580/625] eta 0:00:26 lr 0.000083 wd 0.0500 time 0.5702 (0.5788) data time 0.0006 (0.0015) model time 0.5696 (0.5775) loss 7.2918 (6.9329) grad_norm 3.1492 (2.9826) loss_scale 256.0000 (139.4561) mem 22339MB +[2024-07-28 03:24:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][590/625] eta 0:00:20 lr 0.000083 wd 0.0500 time 0.5705 (0.5788) data time 0.0008 (0.0015) model time 0.5697 (0.5776) loss 6.6805 (6.9362) grad_norm 1.5861 (3.0023) loss_scale 256.0000 (141.4281) mem 22339MB +[2024-07-28 03:24:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][600/625] eta 0:00:14 lr 0.000083 wd 0.0500 time 0.5667 (0.5790) data time 0.0006 (0.0015) model time 0.5661 (0.5777) loss 7.2383 (6.9408) grad_norm 2.3894 (3.0245) loss_scale 256.0000 (143.3344) mem 22339MB +[2024-07-28 03:24:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][610/625] eta 0:00:08 lr 0.000083 wd 0.0500 time 0.5699 (0.5789) data time 0.0004 (0.0015) model time 0.5695 (0.5777) loss 7.4010 (6.9403) grad_norm 2.3144 (3.0205) loss_scale 256.0000 (145.1784) mem 22339MB +[2024-07-28 03:24:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [255/300][620/625] eta 0:00:02 lr 0.000083 wd 0.0500 time 0.5772 (0.5790) data time 0.0004 (0.0015) model time 0.5769 (0.5778) loss 7.4404 (6.9357) grad_norm 2.4202 (3.0121) loss_scale 256.0000 (146.9630) mem 22339MB +[2024-07-28 03:24:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 255 training takes 0:06:01 +[2024-07-28 03:24:43 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 03:24:45 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 03:24:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.462 (0.462) Loss 0.4905 (0.4905) Acc@1 90.430 (90.430) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-28 03:24:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.165) Loss 0.7393 (0.5999) Acc@1 83.203 (88.135) Acc@5 97.217 (98.140) Mem 22339MB +[2024-07-28 03:24:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.146) Loss 0.8232 (0.6862) Acc@1 80.566 (85.519) Acc@5 96.484 (97.391) Mem 22339MB +[2024-07-28 03:24:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.123 Acc@5 97.381 +[2024-07-28 03:24:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.1% +[2024-07-28 03:24:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 85.12% +[2024-07-28 03:24:48 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-28 03:24:50 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-28 03:24:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.460 (0.460) Loss 0.5005 (0.5005) Acc@1 90.381 (90.381) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-28 03:24:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.156) Loss 0.7422 (0.6076) Acc@1 82.959 (88.095) Acc@5 97.119 (98.149) Mem 22339MB +[2024-07-28 03:24:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.144) Loss 0.8340 (0.6922) Acc@1 80.566 (85.354) Acc@5 96.240 (97.363) Mem 22339MB +[2024-07-28 03:24:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 84.957 Acc@5 97.361 +[2024-07-28 03:24:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.0% +[2024-07-28 03:24:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 84.96% +[2024-07-28 03:24:53 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-28 03:24:55 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-28 03:24:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][0/625] eta 0:09:51 lr 0.000083 wd 0.0500 time 0.9460 (0.9460) data time 0.4252 (0.4252) model time 0.0000 (0.0000) loss 5.8257 (5.8257) grad_norm 2.5441 (2.5441) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:25:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][10/625] eta 0:06:12 lr 0.000083 wd 0.0500 time 0.5687 (0.6062) data time 0.0008 (0.0394) model time 0.0000 (0.0000) loss 7.0034 (6.6140) grad_norm 2.0043 (2.3392) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:25:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][20/625] eta 0:05:58 lr 0.000083 wd 0.0500 time 0.5714 (0.5925) data time 0.0007 (0.0211) model time 0.0000 (0.0000) loss 7.5151 (6.7865) grad_norm 2.0199 (2.5419) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:25:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][30/625] eta 0:05:52 lr 0.000083 wd 0.0500 time 0.6728 (0.5918) data time 0.0007 (0.0146) model time 0.0000 (0.0000) loss 7.7361 (6.8438) grad_norm 4.2982 (2.5428) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:25:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][40/625] eta 0:05:44 lr 0.000083 wd 0.0500 time 0.5689 (0.5891) data time 0.0009 (0.0113) model time 0.0000 (0.0000) loss 7.1289 (6.8542) grad_norm 2.4090 (2.6808) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:25:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][50/625] eta 0:05:37 lr 0.000083 wd 0.0500 time 0.5714 (0.5862) data time 0.0009 (0.0093) model time 0.0000 (0.0000) loss 7.2754 (6.9520) grad_norm 2.2858 (2.6911) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:25:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][60/625] eta 0:05:29 lr 0.000083 wd 0.0500 time 0.5716 (0.5840) data time 0.0008 (0.0080) model time 0.5709 (0.5715) loss 7.0847 (6.9739) grad_norm 2.3596 (2.6371) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:25:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][70/625] eta 0:05:24 lr 0.000083 wd 0.0500 time 0.5714 (0.5856) data time 0.0007 (0.0069) model time 0.5707 (0.5830) loss 8.0261 (6.9505) grad_norm 4.1230 (2.6310) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:25:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][80/625] eta 0:05:18 lr 0.000083 wd 0.0500 time 0.5722 (0.5840) data time 0.0006 (0.0062) model time 0.5716 (0.5795) loss 5.9394 (6.9463) grad_norm 2.0040 (2.5743) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:25:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][90/625] eta 0:05:11 lr 0.000082 wd 0.0500 time 0.5726 (0.5829) data time 0.0006 (0.0056) model time 0.5720 (0.5777) loss 6.9177 (6.9712) grad_norm 2.8319 (2.5825) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:25:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][100/625] eta 0:05:05 lr 0.000082 wd 0.0500 time 0.5711 (0.5818) data time 0.0008 (0.0051) model time 0.5703 (0.5765) loss 7.0132 (6.9786) grad_norm 2.3020 (2.6125) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:25:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][110/625] eta 0:04:59 lr 0.000082 wd 0.0500 time 0.5730 (0.5813) data time 0.0006 (0.0047) model time 0.5724 (0.5763) loss 6.6705 (6.9624) grad_norm 2.1262 (2.6556) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:26:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][120/625] eta 0:04:53 lr 0.000082 wd 0.0500 time 0.5798 (0.5807) data time 0.0008 (0.0044) model time 0.5789 (0.5759) loss 6.1903 (6.9775) grad_norm 3.0726 (2.8523) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:26:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][130/625] eta 0:04:47 lr 0.000082 wd 0.0500 time 0.5731 (0.5803) data time 0.0006 (0.0041) model time 0.5725 (0.5757) loss 5.9026 (6.9758) grad_norm 2.3501 (2.9572) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:26:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][140/625] eta 0:04:41 lr 0.000082 wd 0.0500 time 0.5716 (0.5798) data time 0.0009 (0.0039) model time 0.5707 (0.5754) loss 6.4079 (6.9738) grad_norm 4.0315 (3.0541) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:26:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][150/625] eta 0:04:35 lr 0.000082 wd 0.0500 time 0.5721 (0.5794) data time 0.0009 (0.0037) model time 0.5712 (0.5751) loss 7.2465 (6.9790) grad_norm 2.2202 (3.0879) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:26:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][160/625] eta 0:04:29 lr 0.000082 wd 0.0500 time 0.5713 (0.5790) data time 0.0006 (0.0035) model time 0.5707 (0.5749) loss 7.0926 (6.9572) grad_norm 2.0784 (3.0488) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:26:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][170/625] eta 0:04:23 lr 0.000082 wd 0.0500 time 0.5706 (0.5786) data time 0.0006 (0.0034) model time 0.5700 (0.5746) loss 7.5108 (6.9323) grad_norm 2.2372 (3.0035) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:26:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][180/625] eta 0:04:17 lr 0.000082 wd 0.0500 time 0.5705 (0.5783) data time 0.0007 (0.0032) model time 0.5698 (0.5744) loss 6.4882 (6.9219) grad_norm 2.6649 (3.1211) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:26:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][190/625] eta 0:04:11 lr 0.000082 wd 0.0500 time 0.5747 (0.5782) data time 0.0009 (0.0031) model time 0.5738 (0.5744) loss 6.1104 (6.9396) grad_norm 1.8744 (3.2011) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:26:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][200/625] eta 0:04:05 lr 0.000082 wd 0.0500 time 0.5667 (0.5779) data time 0.0008 (0.0030) model time 0.5659 (0.5743) loss 7.8661 (6.9464) grad_norm 1.9978 (3.1577) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:26:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][210/625] eta 0:03:59 lr 0.000082 wd 0.0500 time 0.5732 (0.5777) data time 0.0008 (0.0029) model time 0.5724 (0.5741) loss 7.1638 (6.9351) grad_norm 2.7426 (3.1267) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:27:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][220/625] eta 0:03:53 lr 0.000082 wd 0.0500 time 0.5707 (0.5775) data time 0.0008 (0.0028) model time 0.5699 (0.5740) loss 7.1034 (6.9338) grad_norm 1.7356 (3.0825) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:27:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][230/625] eta 0:03:48 lr 0.000082 wd 0.0500 time 0.5712 (0.5773) data time 0.0009 (0.0027) model time 0.5704 (0.5740) loss 5.7175 (6.9279) grad_norm 1.6105 (3.0390) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:27:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][240/625] eta 0:03:42 lr 0.000082 wd 0.0500 time 0.5730 (0.5771) data time 0.0009 (0.0026) model time 0.5722 (0.5738) loss 7.6736 (6.9297) grad_norm 1.7632 (3.0058) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:27:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][250/625] eta 0:03:36 lr 0.000082 wd 0.0500 time 0.5684 (0.5775) data time 0.0008 (0.0026) model time 0.5676 (0.5745) loss 7.4742 (6.9144) grad_norm 1.6955 (2.9778) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:27:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][260/625] eta 0:03:30 lr 0.000082 wd 0.0500 time 0.5719 (0.5774) data time 0.0008 (0.0025) model time 0.5710 (0.5745) loss 6.5820 (6.9032) grad_norm 2.0546 (2.9554) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:27:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][270/625] eta 0:03:24 lr 0.000082 wd 0.0500 time 0.5724 (0.5774) data time 0.0006 (0.0024) model time 0.5717 (0.5745) loss 7.9300 (6.9001) grad_norm 2.0507 (2.9371) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 03:27:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][280/625] eta 0:03:19 lr 0.000082 wd 0.0500 time 0.5716 (0.5772) data time 0.0008 (0.0024) model time 0.5708 (0.5744) loss 8.5355 (6.9127) grad_norm 2.0022 (inf) loss_scale 128.0000 (254.6335) mem 22339MB +[2024-07-28 03:27:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][290/625] eta 0:03:13 lr 0.000081 wd 0.0500 time 0.5772 (0.5786) data time 0.0009 (0.0023) model time 0.5764 (0.5762) loss 7.4226 (6.9234) grad_norm 2.4081 (inf) loss_scale 128.0000 (250.2818) mem 22339MB +[2024-07-28 03:27:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][300/625] eta 0:03:07 lr 0.000081 wd 0.0500 time 0.5693 (0.5784) data time 0.0009 (0.0023) model time 0.5684 (0.5760) loss 6.0683 (6.9185) grad_norm 3.2214 (inf) loss_scale 128.0000 (246.2193) mem 22339MB +[2024-07-28 03:27:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][310/625] eta 0:03:02 lr 0.000081 wd 0.0500 time 0.5725 (0.5782) data time 0.0008 (0.0022) model time 0.5716 (0.5758) loss 7.5469 (6.9129) grad_norm 3.7795 (inf) loss_scale 128.0000 (242.4180) mem 22339MB +[2024-07-28 03:28:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][320/625] eta 0:02:56 lr 0.000081 wd 0.0500 time 0.5708 (0.5783) data time 0.0006 (0.0022) model time 0.5701 (0.5759) loss 6.9912 (6.9268) grad_norm 1.8656 (inf) loss_scale 128.0000 (238.8536) mem 22339MB +[2024-07-28 03:28:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][330/625] eta 0:02:50 lr 0.000081 wd 0.0500 time 0.5702 (0.5784) data time 0.0007 (0.0022) model time 0.5695 (0.5761) loss 6.4930 (6.9032) grad_norm 2.6021 (inf) loss_scale 128.0000 (235.5045) mem 22339MB +[2024-07-28 03:28:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][340/625] eta 0:02:44 lr 0.000081 wd 0.0500 time 0.5695 (0.5786) data time 0.0008 (0.0022) model time 0.5687 (0.5763) loss 6.6636 (6.8914) grad_norm 2.5995 (inf) loss_scale 128.0000 (232.3519) mem 22339MB +[2024-07-28 03:28:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][350/625] eta 0:02:39 lr 0.000081 wd 0.0500 time 0.5720 (0.5786) data time 0.0007 (0.0021) model time 0.5713 (0.5764) loss 7.4943 (6.8858) grad_norm 2.1913 (inf) loss_scale 128.0000 (229.3789) mem 22339MB +[2024-07-28 03:28:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][360/625] eta 0:02:33 lr 0.000081 wd 0.0500 time 0.5704 (0.5785) data time 0.0006 (0.0021) model time 0.5698 (0.5764) loss 7.1807 (6.8904) grad_norm 1.5408 (inf) loss_scale 128.0000 (226.5706) mem 22339MB +[2024-07-28 03:28:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][370/625] eta 0:02:27 lr 0.000081 wd 0.0500 time 0.5703 (0.5785) data time 0.0008 (0.0020) model time 0.5695 (0.5764) loss 7.1848 (6.9003) grad_norm 3.2510 (inf) loss_scale 128.0000 (223.9137) mem 22339MB +[2024-07-28 03:28:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][380/625] eta 0:02:21 lr 0.000081 wd 0.0500 time 0.5693 (0.5785) data time 0.0010 (0.0020) model time 0.5683 (0.5765) loss 8.5660 (6.9087) grad_norm 1.6998 (inf) loss_scale 128.0000 (221.3963) mem 22339MB +[2024-07-28 03:28:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][390/625] eta 0:02:15 lr 0.000081 wd 0.0500 time 0.5706 (0.5786) data time 0.0007 (0.0020) model time 0.5699 (0.5765) loss 6.5774 (6.9110) grad_norm 13.4730 (inf) loss_scale 128.0000 (219.0077) mem 22339MB +[2024-07-28 03:28:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][400/625] eta 0:02:10 lr 0.000081 wd 0.0500 time 0.5730 (0.5786) data time 0.0006 (0.0020) model time 0.5724 (0.5766) loss 7.6193 (6.9044) grad_norm 1.9196 (inf) loss_scale 128.0000 (216.7382) mem 22339MB +[2024-07-28 03:28:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][410/625] eta 0:02:04 lr 0.000081 wd 0.0500 time 0.5733 (0.5785) data time 0.0009 (0.0020) model time 0.5724 (0.5765) loss 7.7726 (6.9068) grad_norm 2.9598 (inf) loss_scale 128.0000 (214.5791) mem 22339MB +[2024-07-28 03:28:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-28 03:28:56 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 03:28:57 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 03:37:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 03:37:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 03:40:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 03:40:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 03:41:05 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-28 03:41:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-28 03:41:36 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-28 03:41:36 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-28 03:41:37 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-28 03:41:37 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 256) +[2024-07-28 03:41:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-28 03:41:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][420/625] eta 0:09:42 lr 0.000081 wd 0.0500 time 0.5803 (2.8411) data time 0.0009 (0.1363) model time 0.5794 (2.7049) loss 7.0921 (7.2049) grad_norm 2.5072 (3.0505) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:42:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][430/625] eta 0:04:20 lr 0.000081 wd 0.0500 time 0.5802 (1.3351) data time 0.0011 (0.0464) model time 0.5791 (1.2887) loss 6.7681 (7.1310) grad_norm 1.9132 (2.5318) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:42:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][440/625] eta 0:03:11 lr 0.000081 wd 0.0500 time 0.5742 (1.0348) data time 0.0011 (0.0283) model time 0.5731 (1.0065) loss 7.1470 (7.1392) grad_norm 3.1635 (2.5310) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:42:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][450/625] eta 0:02:38 lr 0.000081 wd 0.0500 time 0.5784 (0.9055) data time 0.0012 (0.0206) model time 0.5772 (0.8849) loss 7.0032 (7.1189) grad_norm 2.7694 (2.5345) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:42:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][460/625] eta 0:02:17 lr 0.000081 wd 0.0500 time 0.5808 (0.8337) data time 0.0011 (0.0163) model time 0.5797 (0.8174) loss 7.9735 (7.0411) grad_norm 7.9809 (2.6292) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:42:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][470/625] eta 0:02:03 lr 0.000081 wd 0.0500 time 0.8196 (0.7958) data time 0.0009 (0.0135) model time 0.8187 (0.7823) loss 6.1807 (7.0098) grad_norm 2.1245 (2.5488) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:42:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][480/625] eta 0:01:50 lr 0.000081 wd 0.0500 time 0.5856 (0.7632) data time 0.0011 (0.0116) model time 0.5846 (0.7516) loss 7.3877 (6.9842) grad_norm 1.9940 (2.5821) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:42:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][490/625] eta 0:01:39 lr 0.000080 wd 0.0500 time 0.5847 (0.7396) data time 0.0011 (0.0102) model time 0.5837 (0.7294) loss 5.7692 (6.9284) grad_norm 2.0364 (2.5502) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:42:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][500/625] eta 0:01:30 lr 0.000080 wd 0.0500 time 0.5865 (0.7219) data time 0.0009 (0.0092) model time 0.5857 (0.7126) loss 6.1451 (6.9211) grad_norm 3.0695 (2.5895) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:42:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][510/625] eta 0:01:21 lr 0.000080 wd 0.0500 time 0.5851 (0.7081) data time 0.0011 (0.0084) model time 0.5840 (0.6997) loss 7.4486 (6.9266) grad_norm 2.2387 (2.5784) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:42:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][520/625] eta 0:01:13 lr 0.000080 wd 0.0500 time 0.5842 (0.6963) data time 0.0010 (0.0077) model time 0.5832 (0.6886) loss 6.5592 (6.9587) grad_norm 1.9009 (2.5610) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:43:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][530/625] eta 0:01:05 lr 0.000080 wd 0.0500 time 0.5795 (0.6868) data time 0.0008 (0.0071) model time 0.5787 (0.6797) loss 5.9415 (6.9717) grad_norm 2.2783 (2.5324) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:43:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][540/625] eta 0:00:57 lr 0.000080 wd 0.0500 time 0.5800 (0.6789) data time 0.0008 (0.0066) model time 0.5792 (0.6723) loss 6.2577 (6.9677) grad_norm 2.2797 (2.5101) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:43:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][550/625] eta 0:00:50 lr 0.000080 wd 0.0500 time 0.5880 (0.6722) data time 0.0009 (0.0062) model time 0.5872 (0.6660) loss 6.7540 (6.9782) grad_norm 3.0636 (2.5192) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:43:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][560/625] eta 0:00:43 lr 0.000080 wd 0.0500 time 0.5851 (0.6664) data time 0.0011 (0.0058) model time 0.5840 (0.6606) loss 6.9673 (6.9575) grad_norm 5.9800 (2.6021) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:43:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][570/625] eta 0:00:36 lr 0.000080 wd 0.0500 time 0.5880 (0.6616) data time 0.0011 (0.0055) model time 0.5870 (0.6561) loss 7.6594 (6.9547) grad_norm 2.0501 (2.6255) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:43:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][580/625] eta 0:00:29 lr 0.000080 wd 0.0500 time 0.5844 (0.6571) data time 0.0012 (0.0053) model time 0.5832 (0.6518) loss 7.7981 (6.9564) grad_norm 2.2007 (2.6052) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:43:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][590/625] eta 0:00:22 lr 0.000080 wd 0.0500 time 0.5841 (0.6529) data time 0.0008 (0.0050) model time 0.5833 (0.6479) loss 7.3090 (6.9611) grad_norm 2.1966 (2.6404) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:43:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][600/625] eta 0:00:16 lr 0.000080 wd 0.0500 time 0.5815 (0.6492) data time 0.0010 (0.0048) model time 0.5804 (0.6444) loss 6.9852 (6.9696) grad_norm 2.8577 (2.6505) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:43:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][610/625] eta 0:00:09 lr 0.000080 wd 0.0500 time 0.5854 (0.6458) data time 0.0008 (0.0046) model time 0.5846 (0.6412) loss 5.5224 (6.9537) grad_norm 1.8741 (2.6483) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:43:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [256/300][620/625] eta 0:00:03 lr 0.000080 wd 0.0500 time 0.5859 (0.6428) data time 0.0008 (0.0045) model time 0.5851 (0.6383) loss 6.6148 (6.9429) grad_norm 2.0908 (2.6620) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:43:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 256 training takes 0:02:14 +[2024-07-28 03:43:56 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 03:44:02 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 03:44:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.512 (0.512) Loss 0.4883 (0.4883) Acc@1 90.576 (90.576) Acc@5 98.926 (98.926) Mem 22341MB +[2024-07-28 03:44:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.161) Loss 0.7314 (0.5950) Acc@1 83.203 (88.077) Acc@5 97.412 (98.198) Mem 22341MB +[2024-07-28 03:44:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.144) Loss 0.8271 (0.6805) Acc@1 80.420 (85.447) Acc@5 96.289 (97.396) Mem 22341MB +[2024-07-28 03:44:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.065 Acc@5 97.389 +[2024-07-28 03:44:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.1% +[2024-07-28 03:44:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.839 (0.839) Loss 0.4998 (0.4998) Acc@1 90.381 (90.381) Acc@5 98.975 (98.975) Mem 22341MB +[2024-07-28 03:44:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.191) Loss 0.7412 (0.6066) Acc@1 83.008 (88.108) Acc@5 97.021 (98.153) Mem 22341MB +[2024-07-28 03:44:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.160) Loss 0.8325 (0.6913) Acc@1 80.713 (85.403) Acc@5 96.289 (97.368) Mem 22341MB +[2024-07-28 03:44:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.013 Acc@5 97.367 +[2024-07-28 03:44:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.0% +[2024-07-28 03:44:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.01% +[2024-07-28 03:44:12 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-28 03:44:16 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-28 03:44:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][0/625] eta 0:11:45 lr 0.000080 wd 0.0500 time 1.1281 (1.1281) data time 0.4138 (0.4138) model time 0.0000 (0.0000) loss 6.6127 (6.6127) grad_norm 1.9826 (1.9826) loss_scale 128.0000 (128.0000) mem 22337MB +[2024-07-28 03:44:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][10/625] eta 0:06:37 lr 0.000080 wd 0.0500 time 0.5826 (0.6459) data time 0.0012 (0.0386) model time 0.0000 (0.0000) loss 6.4114 (6.5266) grad_norm 3.6335 (3.0012) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:44:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-28 03:44:24 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 03:44:29 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 03:46:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 03:46:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 03:46:48 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-28 03:47:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-28 03:47:01 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-28 03:47:02 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-28 03:47:02 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-28 03:47:02 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 257) +[2024-07-28 03:47:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-28 03:47:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][20/625] eta 0:15:59 lr 0.000080 wd 0.0500 time 0.5717 (1.5865) data time 0.0009 (0.0599) model time 0.0000 (0.0000) loss 7.6214 (7.2428) grad_norm 2.7729 (3.9172) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:47:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][30/625] eta 0:10:42 lr 0.000080 wd 0.0500 time 0.5772 (1.0799) data time 0.0006 (0.0304) model time 0.0000 (0.0000) loss 7.0591 (7.1106) grad_norm 3.3010 (3.7328) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:47:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][40/625] eta 0:08:54 lr 0.000080 wd 0.0500 time 0.5715 (0.9137) data time 0.0009 (0.0206) model time 0.0000 (0.0000) loss 7.8582 (7.1361) grad_norm 1.8138 (3.4163) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:47:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][50/625] eta 0:07:56 lr 0.000080 wd 0.0500 time 0.5733 (0.8284) data time 0.0007 (0.0156) model time 0.0000 (0.0000) loss 5.9554 (7.0148) grad_norm 1.8665 (3.2052) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:47:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][60/625] eta 0:07:19 lr 0.000080 wd 0.0500 time 0.5951 (0.7778) data time 0.0008 (0.0127) model time 0.5943 (0.5746) loss 6.4863 (6.9777) grad_norm 2.3890 (2.9890) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:47:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][70/625] eta 0:06:56 lr 0.000079 wd 0.0500 time 0.6205 (0.7510) data time 0.0006 (0.0107) model time 0.6198 (0.5953) loss 7.1469 (6.9723) grad_norm 2.0849 (2.8578) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:47:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][80/625] eta 0:06:35 lr 0.000079 wd 0.0500 time 0.5756 (0.7265) data time 0.0007 (0.0093) model time 0.5749 (0.5898) loss 6.3386 (6.9405) grad_norm 2.7496 (2.8750) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:48:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][90/625] eta 0:06:18 lr 0.000079 wd 0.0500 time 0.5777 (0.7084) data time 0.0008 (0.0083) model time 0.5770 (0.5875) loss 7.8813 (6.9349) grad_norm 2.3578 (2.7937) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:48:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][100/625] eta 0:06:04 lr 0.000079 wd 0.0500 time 0.5819 (0.6940) data time 0.0006 (0.0074) model time 0.5812 (0.5857) loss 7.4494 (6.9159) grad_norm 1.7955 (2.7226) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:48:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][110/625] eta 0:05:51 lr 0.000079 wd 0.0500 time 0.5758 (0.6826) data time 0.0009 (0.0068) model time 0.5750 (0.5846) loss 7.5911 (6.9382) grad_norm 2.4461 (2.7711) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:48:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][120/625] eta 0:05:39 lr 0.000079 wd 0.0500 time 0.5773 (0.6730) data time 0.0009 (0.0062) model time 0.5764 (0.5834) loss 6.3260 (6.9340) grad_norm 1.6591 (2.8347) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:48:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][130/625] eta 0:05:29 lr 0.000079 wd 0.0500 time 0.5757 (0.6650) data time 0.0007 (0.0058) model time 0.5749 (0.5825) loss 7.8398 (6.9466) grad_norm 3.1444 (2.9066) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:48:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][140/625] eta 0:05:19 lr 0.000079 wd 0.0500 time 0.5865 (0.6587) data time 0.0006 (0.0054) model time 0.5859 (0.5825) loss 6.6031 (6.9361) grad_norm 1.9437 (2.8953) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:48:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][150/625] eta 0:05:10 lr 0.000079 wd 0.0500 time 0.5787 (0.6531) data time 0.0007 (0.0051) model time 0.5781 (0.5821) loss 5.3050 (6.9177) grad_norm 1.8104 (2.8682) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:48:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][160/625] eta 0:05:01 lr 0.000079 wd 0.0500 time 0.5845 (0.6482) data time 0.0008 (0.0048) model time 0.5837 (0.5819) loss 7.7968 (6.9113) grad_norm 2.0987 (2.8762) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:48:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][170/625] eta 0:04:53 lr 0.000079 wd 0.0500 time 0.5847 (0.6440) data time 0.0008 (0.0045) model time 0.5839 (0.5817) loss 6.9795 (6.9216) grad_norm 4.4119 (2.9046) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:48:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][180/625] eta 0:04:44 lr 0.000079 wd 0.0500 time 0.5817 (0.6402) data time 0.0007 (0.0043) model time 0.5810 (0.5815) loss 5.8387 (6.9231) grad_norm 1.9620 (2.8580) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:49:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][190/625] eta 0:04:36 lr 0.000079 wd 0.0500 time 0.5768 (0.6367) data time 0.0007 (0.0041) model time 0.5761 (0.5810) loss 6.6410 (6.9110) grad_norm 2.1662 (2.8246) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:49:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][200/625] eta 0:04:29 lr 0.000079 wd 0.0500 time 0.5835 (0.6336) data time 0.0006 (0.0040) model time 0.5828 (0.5808) loss 6.0721 (6.9158) grad_norm 2.9068 (2.8042) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:49:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][210/625] eta 0:04:21 lr 0.000079 wd 0.0500 time 0.5751 (0.6310) data time 0.0008 (0.0038) model time 0.5743 (0.5808) loss 7.3057 (6.9034) grad_norm 2.3478 (2.7825) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:49:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][220/625] eta 0:04:14 lr 0.000079 wd 0.0500 time 0.5783 (0.6285) data time 0.0006 (0.0037) model time 0.5777 (0.5806) loss 6.2839 (6.8890) grad_norm 2.3103 (2.7894) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:49:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][230/625] eta 0:04:07 lr 0.000079 wd 0.0500 time 0.5792 (0.6265) data time 0.0009 (0.0035) model time 0.5784 (0.5808) loss 6.6492 (6.8839) grad_norm 3.0821 (2.7763) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:49:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][240/625] eta 0:04:00 lr 0.000079 wd 0.0500 time 0.6263 (0.6250) data time 0.0009 (0.0034) model time 0.6254 (0.5813) loss 7.9933 (6.8967) grad_norm 2.2125 (2.8019) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:49:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][250/625] eta 0:03:53 lr 0.000079 wd 0.0500 time 0.5803 (0.6232) data time 0.0009 (0.0033) model time 0.5794 (0.5813) loss 8.2379 (6.8904) grad_norm 2.7819 (2.8161) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:49:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][260/625] eta 0:03:46 lr 0.000079 wd 0.0500 time 0.5772 (0.6215) data time 0.0007 (0.0032) model time 0.5765 (0.5812) loss 5.5718 (6.8855) grad_norm 2.3183 (2.8135) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:49:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][270/625] eta 0:03:40 lr 0.000078 wd 0.0500 time 0.5753 (0.6197) data time 0.0009 (0.0031) model time 0.5744 (0.5809) loss 5.1837 (6.8759) grad_norm 2.2751 (2.8050) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:49:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][280/625] eta 0:03:33 lr 0.000078 wd 0.0500 time 0.5803 (0.6181) data time 0.0007 (0.0031) model time 0.5796 (0.5807) loss 7.1987 (6.8677) grad_norm 2.1082 (2.8239) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:49:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][290/625] eta 0:03:26 lr 0.000078 wd 0.0500 time 0.5809 (0.6175) data time 0.0009 (0.0030) model time 0.5801 (0.5815) loss 6.8280 (6.8743) grad_norm 1.8220 (2.8010) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:50:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][300/625] eta 0:03:20 lr 0.000078 wd 0.0500 time 0.5799 (0.6162) data time 0.0008 (0.0029) model time 0.5791 (0.5814) loss 6.0453 (6.8707) grad_norm 2.3732 (2.7776) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:50:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][310/625] eta 0:03:13 lr 0.000078 wd 0.0500 time 0.5986 (0.6150) data time 0.0007 (0.0028) model time 0.5979 (0.5814) loss 6.5592 (6.8595) grad_norm 10.3940 (2.7976) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:50:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][320/625] eta 0:03:07 lr 0.000078 wd 0.0500 time 0.5771 (0.6139) data time 0.0008 (0.0028) model time 0.5763 (0.5812) loss 6.6732 (6.8598) grad_norm 1.8985 (2.7809) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:50:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][330/625] eta 0:03:00 lr 0.000078 wd 0.0500 time 0.5792 (0.6128) data time 0.0009 (0.0027) model time 0.5783 (0.5811) loss 6.5897 (6.8779) grad_norm 2.8804 (2.7581) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:50:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][340/625] eta 0:02:54 lr 0.000078 wd 0.0500 time 0.5772 (0.6117) data time 0.0007 (0.0027) model time 0.5765 (0.5809) loss 6.9708 (6.8787) grad_norm 2.8219 (2.7541) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:50:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][350/625] eta 0:02:47 lr 0.000078 wd 0.0500 time 0.5859 (0.6106) data time 0.0009 (0.0026) model time 0.5850 (0.5807) loss 6.8531 (6.8749) grad_norm 2.1829 (2.7577) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:50:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][360/625] eta 0:02:41 lr 0.000078 wd 0.0500 time 0.5792 (0.6097) data time 0.0008 (0.0026) model time 0.5784 (0.5806) loss 6.3354 (6.8731) grad_norm 4.1287 (2.7648) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:50:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][370/625] eta 0:02:35 lr 0.000078 wd 0.0500 time 0.5851 (0.6089) data time 0.0007 (0.0025) model time 0.5843 (0.5806) loss 7.4060 (6.8774) grad_norm 1.7958 (2.7596) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:50:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][380/625] eta 0:02:28 lr 0.000078 wd 0.0500 time 0.5815 (0.6081) data time 0.0009 (0.0025) model time 0.5806 (0.5805) loss 7.7986 (6.8778) grad_norm 3.7945 (2.7576) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:50:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][390/625] eta 0:02:22 lr 0.000078 wd 0.0500 time 0.5807 (0.6074) data time 0.0006 (0.0024) model time 0.5801 (0.5805) loss 5.8027 (6.8797) grad_norm 1.8197 (2.7504) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:51:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][400/625] eta 0:02:16 lr 0.000078 wd 0.0500 time 0.5980 (0.6067) data time 0.0008 (0.0024) model time 0.5971 (0.5806) loss 5.4009 (6.8725) grad_norm 2.5652 (2.7544) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:51:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][410/625] eta 0:02:10 lr 0.000078 wd 0.0500 time 0.5753 (0.6060) data time 0.0009 (0.0023) model time 0.5745 (0.5804) loss 5.6641 (6.8749) grad_norm 2.0184 (2.7549) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:51:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][420/625] eta 0:02:04 lr 0.000078 wd 0.0500 time 0.5775 (0.6054) data time 0.0006 (0.0023) model time 0.5769 (0.5804) loss 7.4355 (6.8784) grad_norm 2.2871 (2.7449) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:51:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][430/625] eta 0:01:57 lr 0.000078 wd 0.0500 time 0.5889 (0.6047) data time 0.0007 (0.0023) model time 0.5882 (0.5803) loss 6.5865 (6.8749) grad_norm 3.0478 (2.7439) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:51:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][440/625] eta 0:01:51 lr 0.000078 wd 0.0500 time 0.5802 (0.6041) data time 0.0009 (0.0022) model time 0.5794 (0.5803) loss 7.5136 (6.8796) grad_norm 3.4851 (2.7372) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:51:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][450/625] eta 0:01:45 lr 0.000078 wd 0.0500 time 0.5840 (0.6037) data time 0.0008 (0.0022) model time 0.5832 (0.5804) loss 5.8010 (6.8807) grad_norm 2.6579 (2.7475) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:51:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][460/625] eta 0:01:39 lr 0.000078 wd 0.0500 time 0.6373 (0.6033) data time 0.0006 (0.0022) model time 0.6367 (0.5805) loss 7.0308 (6.8761) grad_norm 2.2962 (2.7357) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:51:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][470/625] eta 0:01:33 lr 0.000077 wd 0.0500 time 0.5798 (0.6029) data time 0.0007 (0.0022) model time 0.5791 (0.5805) loss 6.9080 (6.8710) grad_norm 2.5362 (2.7354) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:51:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][480/625] eta 0:01:27 lr 0.000077 wd 0.0500 time 0.5758 (0.6023) data time 0.0009 (0.0022) model time 0.5748 (0.5804) loss 7.2483 (6.8643) grad_norm 2.0325 (2.7347) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:51:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][490/625] eta 0:01:21 lr 0.000077 wd 0.0500 time 0.5774 (0.6018) data time 0.0009 (0.0021) model time 0.5765 (0.5803) loss 7.2070 (6.8646) grad_norm 5.0721 (2.7376) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:52:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][500/625] eta 0:01:15 lr 0.000077 wd 0.0500 time 0.5811 (0.6014) data time 0.0006 (0.0021) model time 0.5805 (0.5803) loss 6.7798 (6.8687) grad_norm 2.7200 (2.7298) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:52:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][510/625] eta 0:01:09 lr 0.000077 wd 0.0500 time 0.5822 (0.6016) data time 0.0006 (0.0021) model time 0.5816 (0.5809) loss 6.3078 (6.8676) grad_norm 3.4717 (2.7295) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:52:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][520/625] eta 0:01:03 lr 0.000077 wd 0.0500 time 0.5801 (0.6012) data time 0.0009 (0.0021) model time 0.5792 (0.5810) loss 8.6421 (6.8803) grad_norm 3.4306 (2.7276) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:52:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][530/625] eta 0:00:57 lr 0.000077 wd 0.0500 time 0.6037 (0.6009) data time 0.0007 (0.0021) model time 0.6031 (0.5810) loss 6.7902 (6.8821) grad_norm 2.9265 (2.7254) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:52:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][540/625] eta 0:00:51 lr 0.000077 wd 0.0500 time 0.5824 (0.6005) data time 0.0008 (0.0020) model time 0.5816 (0.5810) loss 6.9537 (6.8767) grad_norm 2.8389 (2.7223) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:52:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][550/625] eta 0:00:45 lr 0.000077 wd 0.0500 time 0.5788 (0.6001) data time 0.0006 (0.0020) model time 0.5782 (0.5809) loss 6.7555 (6.8730) grad_norm 2.0567 (2.7283) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:52:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][560/625] eta 0:00:38 lr 0.000077 wd 0.0500 time 0.5773 (0.5997) data time 0.0008 (0.0020) model time 0.5765 (0.5808) loss 5.5007 (6.8715) grad_norm 3.1265 (2.7265) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:52:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][570/625] eta 0:00:32 lr 0.000077 wd 0.0500 time 0.5837 (0.5993) data time 0.0008 (0.0020) model time 0.5829 (0.5807) loss 7.3222 (6.8798) grad_norm 2.6732 (2.7408) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:52:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][580/625] eta 0:00:26 lr 0.000077 wd 0.0500 time 0.5754 (0.5989) data time 0.0008 (0.0020) model time 0.5746 (0.5806) loss 6.7657 (6.8835) grad_norm 1.8166 (2.7451) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:52:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][590/625] eta 0:00:20 lr 0.000077 wd 0.0500 time 0.5809 (0.5986) data time 0.0007 (0.0019) model time 0.5802 (0.5806) loss 6.5527 (6.8844) grad_norm 1.9550 (2.7428) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:52:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][600/625] eta 0:00:14 lr 0.000077 wd 0.0500 time 0.5807 (0.5983) data time 0.0007 (0.0019) model time 0.5800 (0.5806) loss 6.9074 (6.8874) grad_norm 2.6710 (2.7669) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:53:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][610/625] eta 0:00:08 lr 0.000077 wd 0.0500 time 0.5797 (0.5980) data time 0.0006 (0.0019) model time 0.5791 (0.5806) loss 7.1107 (6.8914) grad_norm 2.7037 (2.7696) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:53:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [257/300][620/625] eta 0:00:02 lr 0.000077 wd 0.0500 time 0.5792 (0.5977) data time 0.0005 (0.0019) model time 0.5787 (0.5806) loss 6.8648 (6.8867) grad_norm 2.1380 (2.7730) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:53:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 257 training takes 0:06:06 +[2024-07-28 03:53:13 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 03:53:18 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 03:53:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.456 (0.456) Loss 0.4844 (0.4844) Acc@1 90.381 (90.381) Acc@5 98.975 (98.975) Mem 22341MB +[2024-07-28 03:53:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.157) Loss 0.7368 (0.5940) Acc@1 83.301 (88.068) Acc@5 97.021 (98.211) Mem 22341MB +[2024-07-28 03:53:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8115 (0.6776) Acc@1 81.250 (85.568) Acc@5 96.094 (97.382) Mem 22341MB +[2024-07-28 03:53:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.199 Acc@5 97.383 +[2024-07-28 03:53:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.2% +[2024-07-28 03:53:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 85.20% +[2024-07-28 03:53:23 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-28 03:53:27 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-28 03:53:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.459 (0.459) Loss 0.4993 (0.4993) Acc@1 90.381 (90.381) Acc@5 98.975 (98.975) Mem 22341MB +[2024-07-28 03:53:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.155) Loss 0.7417 (0.6064) Acc@1 83.105 (88.130) Acc@5 97.070 (98.167) Mem 22341MB +[2024-07-28 03:53:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.140) Loss 0.8325 (0.6910) Acc@1 80.713 (85.417) Acc@5 96.289 (97.375) Mem 22341MB +[2024-07-28 03:53:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.025 Acc@5 97.373 +[2024-07-28 03:53:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.0% +[2024-07-28 03:53:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.02% +[2024-07-28 03:53:31 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-28 03:53:32 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-28 03:53:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][0/625] eta 0:12:38 lr 0.000077 wd 0.0500 time 1.2140 (1.2140) data time 0.3664 (0.3664) model time 0.0000 (0.0000) loss 7.1720 (7.1720) grad_norm 2.9368 (2.9368) loss_scale 128.0000 (128.0000) mem 22337MB +[2024-07-28 03:53:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][10/625] eta 0:06:34 lr 0.000077 wd 0.0500 time 0.5720 (0.6415) data time 0.0008 (0.0346) model time 0.0000 (0.0000) loss 7.4276 (7.2266) grad_norm 2.0670 (2.9283) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:53:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][20/625] eta 0:06:10 lr 0.000077 wd 0.0500 time 0.5738 (0.6119) data time 0.0008 (0.0185) model time 0.0000 (0.0000) loss 6.4105 (7.0993) grad_norm 1.8813 (2.6134) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:53:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][30/625] eta 0:05:57 lr 0.000077 wd 0.0500 time 0.5814 (0.6004) data time 0.0006 (0.0129) model time 0.0000 (0.0000) loss 5.9115 (6.8529) grad_norm 3.3889 (2.6025) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:53:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][40/625] eta 0:05:47 lr 0.000077 wd 0.0500 time 0.5748 (0.5939) data time 0.0006 (0.0099) model time 0.0000 (0.0000) loss 6.2936 (6.8506) grad_norm 2.8864 (2.5763) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:54:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][50/625] eta 0:05:39 lr 0.000077 wd 0.0500 time 0.5730 (0.5903) data time 0.0006 (0.0081) model time 0.0000 (0.0000) loss 6.6440 (6.7688) grad_norm 4.8845 (2.6268) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:54:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][60/625] eta 0:05:32 lr 0.000076 wd 0.0500 time 0.5755 (0.5890) data time 0.0007 (0.0069) model time 0.5749 (0.5818) loss 6.6201 (6.8263) grad_norm 2.7141 (2.6125) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:54:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][70/625] eta 0:05:25 lr 0.000076 wd 0.0500 time 0.5791 (0.5872) data time 0.0008 (0.0062) model time 0.5783 (0.5781) loss 7.2248 (6.8124) grad_norm 1.8099 (2.5542) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:54:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][80/625] eta 0:05:19 lr 0.000076 wd 0.0500 time 0.5769 (0.5863) data time 0.0008 (0.0057) model time 0.5761 (0.5782) loss 7.4687 (6.8191) grad_norm 2.5310 (2.5123) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:54:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][90/625] eta 0:05:13 lr 0.000076 wd 0.0500 time 0.5787 (0.5856) data time 0.0006 (0.0051) model time 0.5781 (0.5783) loss 6.9482 (6.8146) grad_norm 2.2259 (2.5239) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 03:54:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-28 03:54:27 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 03:54:28 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 03:56:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 03:56:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 03:56:38 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-28 03:56:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-28 03:56:49 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-28 03:56:50 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-28 03:56:50 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-28 03:56:50 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 258) +[2024-07-28 03:56:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-28 03:57:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][100/625] eta 0:16:49 lr 0.000076 wd 0.0500 time 0.5722 (1.9232) data time 0.0007 (0.0889) model time 0.5716 (1.8344) loss 6.9155 (7.2151) grad_norm 2.8711 (3.2084) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:57:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][110/625] eta 0:10:03 lr 0.000076 wd 0.0500 time 0.5724 (1.1727) data time 0.0006 (0.0400) model time 0.5718 (1.1328) loss 6.5802 (7.0159) grad_norm 2.4347 (3.2373) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:57:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][120/625] eta 0:08:03 lr 0.000076 wd 0.0500 time 0.5741 (0.9579) data time 0.0009 (0.0260) model time 0.5733 (0.9319) loss 7.7505 (7.0812) grad_norm 2.2937 (3.1418) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:57:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][130/625] eta 0:07:03 lr 0.000076 wd 0.0500 time 0.5731 (0.8565) data time 0.0008 (0.0194) model time 0.5723 (0.8371) loss 6.4047 (6.9611) grad_norm 2.6771 (2.8985) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 03:57:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-28 03:57:28 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 03:57:33 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 04:01:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 04:01:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 04:02:04 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-28 04:02:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-28 04:02:16 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-28 04:02:17 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-28 04:02:17 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-28 04:02:17 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 258) +[2024-07-28 04:02:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-28 04:02:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][140/625] eta 0:14:16 lr 0.000076 wd 0.0500 time 0.5764 (1.7656) data time 0.0007 (0.0781) model time 0.5757 (1.6874) loss 7.2230 (7.0211) grad_norm 2.4952 (2.4977) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:02:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][150/625] eta 0:08:44 lr 0.000076 wd 0.0500 time 0.5767 (1.1052) data time 0.0007 (0.0358) model time 0.5760 (1.0694) loss 7.1808 (7.0622) grad_norm 3.0640 (2.3937) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:02:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][160/625] eta 0:07:05 lr 0.000076 wd 0.0500 time 0.5731 (0.9156) data time 0.0008 (0.0234) model time 0.5723 (0.8922) loss 7.5092 (7.0676) grad_norm 2.2753 (2.3774) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:02:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][170/625] eta 0:06:16 lr 0.000076 wd 0.0500 time 0.5749 (0.8274) data time 0.0009 (0.0175) model time 0.5740 (0.8100) loss 6.5803 (6.9895) grad_norm 2.2356 (2.3053) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:02:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][180/625] eta 0:05:45 lr 0.000076 wd 0.0500 time 0.5999 (0.7755) data time 0.0009 (0.0140) model time 0.5991 (0.7615) loss 6.5572 (6.9821) grad_norm 2.7474 (2.3469) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:03:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][190/625] eta 0:05:24 lr 0.000076 wd 0.0500 time 0.5809 (0.7471) data time 0.0007 (0.0120) model time 0.5802 (0.7351) loss 7.1028 (6.9716) grad_norm 3.2106 (2.6320) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:03:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][200/625] eta 0:05:07 lr 0.000076 wd 0.0500 time 0.5818 (0.7226) data time 0.0006 (0.0104) model time 0.5812 (0.7123) loss 6.4086 (6.9675) grad_norm 4.7178 (2.6454) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:03:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][210/625] eta 0:04:52 lr 0.000076 wd 0.0500 time 0.5799 (0.7042) data time 0.0006 (0.0092) model time 0.5793 (0.6950) loss 6.3877 (6.9466) grad_norm 1.9151 (2.6031) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:03:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][220/625] eta 0:04:39 lr 0.000076 wd 0.0500 time 0.5840 (0.6910) data time 0.0009 (0.0082) model time 0.5831 (0.6828) loss 7.7433 (6.9464) grad_norm 1.9554 (2.5903) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:03:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][230/625] eta 0:04:28 lr 0.000076 wd 0.0500 time 0.5793 (0.6796) data time 0.0007 (0.0075) model time 0.5786 (0.6722) loss 7.4464 (6.9517) grad_norm 2.9656 (2.5680) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:03:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][240/625] eta 0:04:18 lr 0.000076 wd 0.0500 time 0.6134 (0.6705) data time 0.0006 (0.0070) model time 0.6128 (0.6635) loss 5.8086 (6.9494) grad_norm 2.4639 (2.5737) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:03:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][250/625] eta 0:04:08 lr 0.000076 wd 0.0500 time 0.5777 (0.6632) data time 0.0008 (0.0066) model time 0.5769 (0.6566) loss 6.6861 (6.9444) grad_norm 3.0733 (2.5374) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:03:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][260/625] eta 0:03:59 lr 0.000075 wd 0.0500 time 0.5761 (0.6570) data time 0.0007 (0.0061) model time 0.5754 (0.6509) loss 7.0741 (6.9397) grad_norm 2.1426 (2.5638) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:03:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][270/625] eta 0:03:51 lr 0.000075 wd 0.0500 time 0.5790 (0.6513) data time 0.0008 (0.0058) model time 0.5782 (0.6455) loss 6.4678 (6.9075) grad_norm 1.6745 (2.5837) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:03:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][280/625] eta 0:03:43 lr 0.000075 wd 0.0500 time 0.5816 (0.6464) data time 0.0006 (0.0054) model time 0.5810 (0.6410) loss 6.6639 (6.9202) grad_norm 2.0178 (2.5724) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:04:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][290/625] eta 0:03:35 lr 0.000075 wd 0.0500 time 0.5800 (0.6424) data time 0.0007 (0.0051) model time 0.5793 (0.6372) loss 5.8583 (6.9240) grad_norm 3.3011 (2.5903) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:04:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][300/625] eta 0:03:27 lr 0.000075 wd 0.0500 time 0.5967 (0.6392) data time 0.0008 (0.0049) model time 0.5959 (0.6344) loss 8.2487 (6.9534) grad_norm 1.8614 (2.5834) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:04:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][310/625] eta 0:03:20 lr 0.000075 wd 0.0500 time 0.5873 (0.6358) data time 0.0006 (0.0047) model time 0.5867 (0.6312) loss 5.7852 (6.9427) grad_norm 2.6922 (2.6605) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:04:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][320/625] eta 0:03:12 lr 0.000075 wd 0.0500 time 0.5764 (0.6327) data time 0.0006 (0.0045) model time 0.5757 (0.6282) loss 6.9291 (6.9517) grad_norm 4.0836 (2.6747) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:04:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][330/625] eta 0:03:05 lr 0.000075 wd 0.0500 time 0.5773 (0.6299) data time 0.0008 (0.0043) model time 0.5764 (0.6256) loss 5.6441 (6.9355) grad_norm 2.4881 (2.6806) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:04:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][340/625] eta 0:02:58 lr 0.000075 wd 0.0500 time 0.5797 (0.6274) data time 0.0008 (0.0041) model time 0.5788 (0.6233) loss 8.0705 (6.9284) grad_norm 2.5741 (2.6778) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:04:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][350/625] eta 0:02:51 lr 0.000075 wd 0.0500 time 0.5874 (0.6252) data time 0.0008 (0.0040) model time 0.5865 (0.6213) loss 7.0707 (6.9196) grad_norm 3.5791 (2.6791) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:04:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][360/625] eta 0:02:45 lr 0.000075 wd 0.0500 time 0.5816 (0.6233) data time 0.0009 (0.0038) model time 0.5807 (0.6195) loss 8.6262 (6.9164) grad_norm 4.7335 (2.7883) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:04:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][370/625] eta 0:02:38 lr 0.000075 wd 0.0500 time 0.5836 (0.6216) data time 0.0007 (0.0037) model time 0.5829 (0.6179) loss 7.0461 (6.9106) grad_norm 2.9641 (2.7752) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:04:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][380/625] eta 0:02:31 lr 0.000075 wd 0.0500 time 0.5788 (0.6198) data time 0.0009 (0.0036) model time 0.5779 (0.6162) loss 6.5177 (6.9158) grad_norm 2.5386 (2.7630) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:05:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][390/625] eta 0:02:25 lr 0.000075 wd 0.0500 time 0.5918 (0.6182) data time 0.0009 (0.0035) model time 0.5909 (0.6147) loss 7.3755 (6.8988) grad_norm 2.4130 (2.7668) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:05:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][400/625] eta 0:02:18 lr 0.000075 wd 0.0500 time 0.5786 (0.6167) data time 0.0007 (0.0034) model time 0.5779 (0.6134) loss 7.6791 (6.8904) grad_norm 2.0098 (2.7513) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:05:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][410/625] eta 0:02:12 lr 0.000075 wd 0.0500 time 0.5793 (0.6161) data time 0.0008 (0.0033) model time 0.5784 (0.6128) loss 5.9286 (6.8860) grad_norm 2.5076 (2.7394) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:05:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][420/625] eta 0:02:06 lr 0.000075 wd 0.0500 time 0.5837 (0.6150) data time 0.0006 (0.0032) model time 0.5831 (0.6118) loss 6.8300 (6.8884) grad_norm 2.8566 (2.7295) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:05:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][430/625] eta 0:01:59 lr 0.000075 wd 0.0500 time 0.6121 (0.6140) data time 0.0008 (0.0031) model time 0.6113 (0.6108) loss 5.9133 (6.8703) grad_norm 2.3696 (2.7194) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:05:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][440/625] eta 0:01:53 lr 0.000075 wd 0.0500 time 0.6261 (0.6131) data time 0.0006 (0.0031) model time 0.6255 (0.6100) loss 6.1156 (6.8658) grad_norm 19.7656 (2.7695) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:05:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][450/625] eta 0:01:47 lr 0.000075 wd 0.0500 time 0.5887 (0.6121) data time 0.0009 (0.0030) model time 0.5879 (0.6090) loss 6.4420 (6.8825) grad_norm 2.9330 (2.7733) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:05:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][460/625] eta 0:01:40 lr 0.000075 wd 0.0500 time 0.5927 (0.6112) data time 0.0007 (0.0030) model time 0.5920 (0.6082) loss 6.7281 (6.8943) grad_norm 3.2361 (2.7781) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:05:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][470/625] eta 0:01:34 lr 0.000074 wd 0.0500 time 0.5788 (0.6102) data time 0.0008 (0.0029) model time 0.5780 (0.6072) loss 6.8646 (6.8888) grad_norm 3.5893 (2.8052) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:05:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][480/625] eta 0:01:28 lr 0.000074 wd 0.0500 time 0.5770 (0.6092) data time 0.0006 (0.0029) model time 0.5763 (0.6063) loss 6.2302 (6.8866) grad_norm 6.2604 (2.8132) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:05:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][490/625] eta 0:01:22 lr 0.000074 wd 0.0500 time 0.5939 (0.6084) data time 0.0008 (0.0028) model time 0.5931 (0.6056) loss 7.0047 (6.8851) grad_norm 3.2250 (2.8116) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:06:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][500/625] eta 0:01:15 lr 0.000074 wd 0.0500 time 0.5797 (0.6077) data time 0.0007 (0.0028) model time 0.5790 (0.6049) loss 6.0700 (6.8797) grad_norm 1.8793 (2.7959) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:06:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][510/625] eta 0:01:09 lr 0.000074 wd 0.0500 time 0.5836 (0.6070) data time 0.0009 (0.0027) model time 0.5827 (0.6043) loss 7.3265 (6.8858) grad_norm 2.3357 (2.7915) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:06:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][520/625] eta 0:01:03 lr 0.000074 wd 0.0500 time 0.5809 (0.6063) data time 0.0007 (0.0027) model time 0.5802 (0.6036) loss 6.4014 (6.8808) grad_norm 2.0393 (2.7825) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:06:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][530/625] eta 0:00:57 lr 0.000074 wd 0.0500 time 0.5746 (0.6059) data time 0.0006 (0.0026) model time 0.5740 (0.6032) loss 6.5240 (6.8804) grad_norm 3.0523 (2.7832) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:06:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][540/625] eta 0:00:51 lr 0.000074 wd 0.0500 time 0.5766 (0.6052) data time 0.0008 (0.0026) model time 0.5757 (0.6026) loss 6.7969 (6.8855) grad_norm 2.3017 (2.7738) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:06:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][550/625] eta 0:00:45 lr 0.000074 wd 0.0500 time 0.5831 (0.6045) data time 0.0007 (0.0025) model time 0.5824 (0.6020) loss 8.4265 (6.8852) grad_norm 2.2031 (2.7677) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:06:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][560/625] eta 0:00:39 lr 0.000074 wd 0.0500 time 0.5781 (0.6041) data time 0.0006 (0.0025) model time 0.5774 (0.6016) loss 8.1701 (6.8863) grad_norm 2.4657 (2.7576) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:06:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][570/625] eta 0:00:33 lr 0.000074 wd 0.0500 time 0.5810 (0.6036) data time 0.0009 (0.0025) model time 0.5802 (0.6011) loss 6.9328 (6.8893) grad_norm 2.3425 (2.7516) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:06:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][580/625] eta 0:00:27 lr 0.000074 wd 0.0500 time 0.6032 (0.6031) data time 0.0009 (0.0024) model time 0.6023 (0.6006) loss 6.1779 (6.8805) grad_norm 3.0075 (2.7420) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:06:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][590/625] eta 0:00:21 lr 0.000074 wd 0.0500 time 0.5765 (0.6025) data time 0.0008 (0.0024) model time 0.5757 (0.6001) loss 6.1032 (6.8731) grad_norm 3.2944 (2.8167) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:07:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][600/625] eta 0:00:15 lr 0.000074 wd 0.0500 time 0.5813 (0.6020) data time 0.0010 (0.0024) model time 0.5803 (0.5997) loss 7.3948 (6.8622) grad_norm 1.7556 (2.8077) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:07:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][610/625] eta 0:00:09 lr 0.000074 wd 0.0500 time 0.5724 (0.6015) data time 0.0007 (0.0024) model time 0.5717 (0.5992) loss 6.1428 (6.8606) grad_norm 2.6565 (2.8162) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:07:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [258/300][620/625] eta 0:00:03 lr 0.000074 wd 0.0500 time 0.5772 (0.6011) data time 0.0007 (0.0023) model time 0.5766 (0.5988) loss 7.4673 (6.8674) grad_norm 2.6911 (2.8173) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-28 04:07:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 258 training takes 0:04:55 +[2024-07-28 04:07:16 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 04:07:20 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 04:07:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.468 (0.468) Loss 0.4966 (0.4966) Acc@1 90.527 (90.527) Acc@5 99.023 (99.023) Mem 22341MB +[2024-07-28 04:07:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7402 (0.6017) Acc@1 83.203 (88.077) Acc@5 97.266 (98.189) Mem 22341MB +[2024-07-28 04:07:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.124 (0.142) Loss 0.8330 (0.6881) Acc@1 80.469 (85.428) Acc@5 96.094 (97.377) Mem 22341MB +[2024-07-28 04:07:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.047 Acc@5 97.379 +[2024-07-28 04:07:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.0% +[2024-07-28 04:07:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.765 (0.765) Loss 0.4993 (0.4993) Acc@1 90.381 (90.381) Acc@5 98.926 (98.926) Mem 22341MB +[2024-07-28 04:07:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.185) Loss 0.7402 (0.6060) Acc@1 83.105 (88.148) Acc@5 97.119 (98.171) Mem 22341MB +[2024-07-28 04:07:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.156) Loss 0.8315 (0.6905) Acc@1 80.713 (85.435) Acc@5 96.191 (97.370) Mem 22341MB +[2024-07-28 04:07:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.041 Acc@5 97.369 +[2024-07-28 04:07:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.0% +[2024-07-28 04:07:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.04% +[2024-07-28 04:07:30 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-28 04:07:36 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-28 04:07:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][0/625] eta 0:12:23 lr 0.000074 wd 0.0500 time 1.1904 (1.1904) data time 0.3464 (0.3464) model time 0.0000 (0.0000) loss 6.2523 (6.2523) grad_norm 2.9239 (2.9239) loss_scale 128.0000 (128.0000) mem 22337MB +[2024-07-28 04:07:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][10/625] eta 0:06:39 lr 0.000074 wd 0.0500 time 0.5860 (0.6501) data time 0.0007 (0.0322) model time 0.0000 (0.0000) loss 5.9933 (6.6380) grad_norm 2.7283 (2.9051) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:07:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][20/625] eta 0:06:12 lr 0.000074 wd 0.0500 time 0.5770 (0.6154) data time 0.0006 (0.0173) model time 0.0000 (0.0000) loss 6.9483 (6.9278) grad_norm 3.8164 (2.7562) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:07:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][30/625] eta 0:05:58 lr 0.000074 wd 0.0500 time 0.5756 (0.6026) data time 0.0009 (0.0120) model time 0.0000 (0.0000) loss 7.2760 (6.8753) grad_norm 2.2288 (2.7076) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:08:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][40/625] eta 0:05:49 lr 0.000074 wd 0.0500 time 0.5812 (0.5966) data time 0.0007 (0.0093) model time 0.0000 (0.0000) loss 7.4410 (6.8496) grad_norm 2.7455 (2.6547) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:08:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][50/625] eta 0:05:41 lr 0.000074 wd 0.0500 time 0.5745 (0.5936) data time 0.0009 (0.0076) model time 0.0000 (0.0000) loss 6.4509 (6.7962) grad_norm 3.1671 (2.6218) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:08:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][60/625] eta 0:05:34 lr 0.000073 wd 0.0500 time 0.5763 (0.5917) data time 0.0007 (0.0065) model time 0.5756 (0.5813) loss 7.2501 (6.8190) grad_norm 2.0666 (2.6169) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:08:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][70/625] eta 0:05:27 lr 0.000073 wd 0.0500 time 0.5792 (0.5897) data time 0.0009 (0.0057) model time 0.5783 (0.5788) loss 6.6001 (6.8555) grad_norm 2.0137 (2.5744) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:08:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][80/625] eta 0:05:20 lr 0.000073 wd 0.0500 time 0.5967 (0.5883) data time 0.0006 (0.0051) model time 0.5961 (0.5783) loss 6.3252 (6.8467) grad_norm 4.6049 (2.6465) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:08:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][90/625] eta 0:05:14 lr 0.000073 wd 0.0500 time 0.5777 (0.5871) data time 0.0009 (0.0048) model time 0.5768 (0.5777) loss 7.1302 (6.8699) grad_norm 2.5309 (2.6121) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:08:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][100/625] eta 0:05:08 lr 0.000073 wd 0.0500 time 0.6003 (0.5867) data time 0.0009 (0.0044) model time 0.5994 (0.5786) loss 5.8560 (6.8683) grad_norm 2.1776 (2.6191) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:08:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][110/625] eta 0:05:01 lr 0.000073 wd 0.0500 time 0.5795 (0.5859) data time 0.0008 (0.0041) model time 0.5787 (0.5783) loss 6.6433 (6.8431) grad_norm 2.0826 (2.6130) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:08:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][120/625] eta 0:04:55 lr 0.000073 wd 0.0500 time 0.5788 (0.5850) data time 0.0009 (0.0038) model time 0.5780 (0.5778) loss 7.7975 (6.8730) grad_norm 2.2121 (2.5849) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:08:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][130/625] eta 0:04:49 lr 0.000073 wd 0.0500 time 0.5792 (0.5845) data time 0.0006 (0.0036) model time 0.5785 (0.5777) loss 7.8324 (6.9041) grad_norm 2.4135 (2.5679) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:08:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][140/625] eta 0:04:43 lr 0.000073 wd 0.0500 time 0.5785 (0.5839) data time 0.0007 (0.0034) model time 0.5778 (0.5775) loss 6.8587 (6.9224) grad_norm 1.8966 (2.5407) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:09:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][150/625] eta 0:04:37 lr 0.000073 wd 0.0500 time 0.5757 (0.5835) data time 0.0007 (0.0032) model time 0.5751 (0.5774) loss 6.5002 (6.8983) grad_norm 2.6933 (2.5283) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:09:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][160/625] eta 0:04:31 lr 0.000073 wd 0.0500 time 0.5791 (0.5831) data time 0.0006 (0.0031) model time 0.5785 (0.5773) loss 6.9913 (6.8795) grad_norm 2.1594 (2.5138) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:09:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][170/625] eta 0:04:25 lr 0.000073 wd 0.0500 time 0.5784 (0.5834) data time 0.0008 (0.0030) model time 0.5776 (0.5781) loss 6.4677 (6.8543) grad_norm 2.6667 (2.5018) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:09:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][180/625] eta 0:04:19 lr 0.000073 wd 0.0500 time 0.5738 (0.5829) data time 0.0006 (0.0028) model time 0.5731 (0.5778) loss 7.6247 (6.8625) grad_norm 4.3126 (2.5116) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:09:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][190/625] eta 0:04:13 lr 0.000073 wd 0.0500 time 0.5747 (0.5825) data time 0.0006 (0.0027) model time 0.5741 (0.5775) loss 6.8779 (6.8702) grad_norm 2.3004 (2.5482) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:09:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][200/625] eta 0:04:07 lr 0.000073 wd 0.0500 time 0.5767 (0.5822) data time 0.0009 (0.0026) model time 0.5758 (0.5773) loss 8.0755 (6.8702) grad_norm 4.1120 (2.5753) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:09:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][210/625] eta 0:04:01 lr 0.000073 wd 0.0500 time 0.5789 (0.5819) data time 0.0007 (0.0026) model time 0.5782 (0.5772) loss 6.6763 (6.8577) grad_norm 2.4897 (2.5974) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:09:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][220/625] eta 0:03:55 lr 0.000073 wd 0.0500 time 0.5778 (0.5817) data time 0.0006 (0.0025) model time 0.5772 (0.5772) loss 6.8622 (6.8593) grad_norm 2.1393 (2.5978) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:09:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][230/625] eta 0:03:49 lr 0.000073 wd 0.0500 time 0.5782 (0.5823) data time 0.0006 (0.0024) model time 0.5776 (0.5781) loss 6.6784 (6.8570) grad_norm 2.1533 (2.6180) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:09:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][240/625] eta 0:03:44 lr 0.000073 wd 0.0500 time 0.5794 (0.5821) data time 0.0009 (0.0024) model time 0.5785 (0.5780) loss 7.0919 (6.8583) grad_norm 1.6164 (2.6118) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:10:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][250/625] eta 0:03:38 lr 0.000073 wd 0.0500 time 0.5777 (0.5820) data time 0.0008 (0.0023) model time 0.5768 (0.5781) loss 8.0019 (6.8774) grad_norm 3.0883 (2.6102) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:10:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][260/625] eta 0:03:32 lr 0.000073 wd 0.0500 time 0.5873 (0.5818) data time 0.0008 (0.0022) model time 0.5864 (0.5780) loss 5.8195 (6.8636) grad_norm 2.2240 (2.6037) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:10:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][270/625] eta 0:03:26 lr 0.000072 wd 0.0500 time 0.5770 (0.5817) data time 0.0006 (0.0022) model time 0.5764 (0.5780) loss 6.1173 (6.8658) grad_norm 1.9098 (2.5979) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:10:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][280/625] eta 0:03:20 lr 0.000072 wd 0.0500 time 0.5782 (0.5814) data time 0.0006 (0.0021) model time 0.5776 (0.5778) loss 7.8537 (6.8732) grad_norm 2.5709 (2.6211) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:10:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][290/625] eta 0:03:14 lr 0.000072 wd 0.0500 time 0.5788 (0.5814) data time 0.0008 (0.0021) model time 0.5779 (0.5779) loss 7.0475 (6.8836) grad_norm 2.2015 (2.7287) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:10:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][300/625] eta 0:03:08 lr 0.000072 wd 0.0500 time 0.5837 (0.5814) data time 0.0009 (0.0021) model time 0.5828 (0.5780) loss 7.2801 (6.8831) grad_norm 1.6470 (2.7117) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:10:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][310/625] eta 0:03:03 lr 0.000072 wd 0.0500 time 0.5802 (0.5813) data time 0.0008 (0.0020) model time 0.5794 (0.5779) loss 6.8190 (6.8856) grad_norm 1.9084 (2.6976) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:10:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][320/625] eta 0:02:57 lr 0.000072 wd 0.0500 time 0.5762 (0.5813) data time 0.0006 (0.0020) model time 0.5756 (0.5780) loss 5.6279 (6.8737) grad_norm 2.5391 (2.7048) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:10:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][330/625] eta 0:02:51 lr 0.000072 wd 0.0500 time 0.5912 (0.5813) data time 0.0009 (0.0020) model time 0.5903 (0.5781) loss 5.8954 (6.8746) grad_norm 3.3694 (2.8107) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:10:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][340/625] eta 0:02:45 lr 0.000072 wd 0.0500 time 0.5755 (0.5811) data time 0.0006 (0.0019) model time 0.5749 (0.5780) loss 6.4623 (6.8693) grad_norm 2.0846 (2.8339) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:11:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][350/625] eta 0:02:39 lr 0.000072 wd 0.0500 time 0.5853 (0.5810) data time 0.0006 (0.0019) model time 0.5847 (0.5779) loss 7.8089 (6.8712) grad_norm 1.9636 (2.8301) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:11:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][360/625] eta 0:02:33 lr 0.000072 wd 0.0500 time 0.5770 (0.5809) data time 0.0007 (0.0019) model time 0.5764 (0.5779) loss 7.0716 (6.8623) grad_norm 2.8528 (2.8220) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:11:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][370/625] eta 0:02:28 lr 0.000072 wd 0.0500 time 0.5758 (0.5809) data time 0.0009 (0.0019) model time 0.5750 (0.5780) loss 5.7485 (6.8610) grad_norm 2.9073 (2.8172) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:11:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][380/625] eta 0:02:22 lr 0.000072 wd 0.0500 time 0.5930 (0.5809) data time 0.0006 (0.0018) model time 0.5923 (0.5780) loss 6.6495 (6.8621) grad_norm 2.7778 (2.8100) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:11:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][390/625] eta 0:02:16 lr 0.000072 wd 0.0500 time 0.5769 (0.5812) data time 0.0010 (0.0018) model time 0.5759 (0.5784) loss 7.2405 (6.8558) grad_norm 1.8106 (2.8060) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:11:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][400/625] eta 0:02:10 lr 0.000072 wd 0.0500 time 0.5760 (0.5811) data time 0.0007 (0.0018) model time 0.5753 (0.5784) loss 6.9299 (6.8566) grad_norm 1.6357 (2.7959) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-28 04:11:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][410/625] eta 0:02:04 lr 0.000072 wd 0.0500 time 0.5773 (0.5810) data time 0.0009 (0.0018) model time 0.5765 (0.5783) loss 8.0005 (6.8614) grad_norm 1.9523 (2.7867) loss_scale 256.0000 (130.4915) mem 22339MB +[2024-07-28 04:11:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][420/625] eta 0:01:59 lr 0.000072 wd 0.0500 time 0.5759 (0.5809) data time 0.0006 (0.0017) model time 0.5752 (0.5782) loss 7.4830 (6.8611) grad_norm 3.2668 (2.7840) loss_scale 256.0000 (133.4727) mem 22339MB +[2024-07-28 04:11:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][430/625] eta 0:01:53 lr 0.000072 wd 0.0500 time 0.5772 (0.5808) data time 0.0009 (0.0017) model time 0.5763 (0.5781) loss 6.9083 (6.8729) grad_norm 1.9908 (2.7697) loss_scale 256.0000 (136.3155) mem 22339MB +[2024-07-28 04:11:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][440/625] eta 0:01:47 lr 0.000072 wd 0.0500 time 0.5807 (0.5807) data time 0.0007 (0.0017) model time 0.5800 (0.5781) loss 5.9781 (6.8787) grad_norm 4.1395 (2.8348) loss_scale 256.0000 (139.0295) mem 22339MB +[2024-07-28 04:11:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][450/625] eta 0:01:41 lr 0.000072 wd 0.0500 time 0.5812 (0.5811) data time 0.0008 (0.0017) model time 0.5804 (0.5786) loss 5.7913 (6.8708) grad_norm 2.1601 (2.8592) loss_scale 256.0000 (141.6231) mem 22339MB +[2024-07-28 04:12:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][460/625] eta 0:01:35 lr 0.000072 wd 0.0500 time 0.5756 (0.5810) data time 0.0006 (0.0017) model time 0.5749 (0.5785) loss 6.7553 (6.8655) grad_norm 2.5275 (2.8721) loss_scale 256.0000 (144.1041) mem 22339MB +[2024-07-28 04:12:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][470/625] eta 0:01:30 lr 0.000072 wd 0.0500 time 0.5779 (0.5810) data time 0.0009 (0.0016) model time 0.5770 (0.5785) loss 6.7316 (6.8676) grad_norm 3.5595 (2.8711) loss_scale 256.0000 (146.4798) mem 22339MB +[2024-07-28 04:12:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][480/625] eta 0:01:24 lr 0.000071 wd 0.0500 time 0.5795 (0.5809) data time 0.0007 (0.0016) model time 0.5789 (0.5784) loss 6.3599 (6.8701) grad_norm 2.1054 (2.8646) loss_scale 256.0000 (148.7568) mem 22339MB +[2024-07-28 04:12:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][490/625] eta 0:01:18 lr 0.000071 wd 0.0500 time 0.5786 (0.5809) data time 0.0006 (0.0016) model time 0.5780 (0.5785) loss 5.8097 (6.8656) grad_norm 2.6161 (2.8803) loss_scale 256.0000 (150.9409) mem 22339MB +[2024-07-28 04:12:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][500/625] eta 0:01:12 lr 0.000071 wd 0.0500 time 0.5759 (0.5808) data time 0.0006 (0.0016) model time 0.5753 (0.5784) loss 6.0605 (6.8698) grad_norm 7.6238 (2.8889) loss_scale 256.0000 (153.0379) mem 22339MB +[2024-07-28 04:12:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][510/625] eta 0:01:06 lr 0.000071 wd 0.0500 time 0.5767 (0.5807) data time 0.0006 (0.0016) model time 0.5761 (0.5784) loss 6.7577 (6.8646) grad_norm 2.3744 (2.8766) loss_scale 256.0000 (155.0528) mem 22339MB +[2024-07-28 04:12:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][520/625] eta 0:01:00 lr 0.000071 wd 0.0500 time 0.5923 (0.5807) data time 0.0009 (0.0016) model time 0.5915 (0.5784) loss 7.3933 (6.8722) grad_norm 2.0180 (2.8942) loss_scale 256.0000 (156.9904) mem 22339MB +[2024-07-28 04:12:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][530/625] eta 0:00:55 lr 0.000071 wd 0.0500 time 0.5784 (0.5806) data time 0.0006 (0.0016) model time 0.5778 (0.5783) loss 5.2656 (6.8664) grad_norm 2.4968 (2.9198) loss_scale 256.0000 (158.8550) mem 22339MB +[2024-07-28 04:12:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][540/625] eta 0:00:49 lr 0.000071 wd 0.0500 time 0.5835 (0.5806) data time 0.0007 (0.0016) model time 0.5829 (0.5783) loss 5.8427 (6.8627) grad_norm 2.5184 (2.9125) loss_scale 256.0000 (160.6506) mem 22339MB +[2024-07-28 04:12:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][550/625] eta 0:00:43 lr 0.000071 wd 0.0500 time 0.5803 (0.5805) data time 0.0006 (0.0015) model time 0.5797 (0.5783) loss 7.2380 (6.8725) grad_norm 2.2788 (2.9131) loss_scale 256.0000 (162.3811) mem 22339MB +[2024-07-28 04:13:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][560/625] eta 0:00:37 lr 0.000071 wd 0.0500 time 0.5802 (0.5805) data time 0.0008 (0.0015) model time 0.5794 (0.5782) loss 7.4309 (6.8720) grad_norm 2.4999 (2.9121) loss_scale 256.0000 (164.0499) mem 22339MB +[2024-07-28 04:13:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][570/625] eta 0:00:31 lr 0.000071 wd 0.0500 time 0.5778 (0.5805) data time 0.0006 (0.0015) model time 0.5772 (0.5783) loss 6.6765 (6.8746) grad_norm 2.8414 (2.9258) loss_scale 256.0000 (165.6602) mem 22339MB +[2024-07-28 04:13:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][580/625] eta 0:00:26 lr 0.000071 wd 0.0500 time 0.6011 (0.5805) data time 0.0009 (0.0015) model time 0.6003 (0.5784) loss 7.3901 (6.8745) grad_norm 2.6402 (2.9203) loss_scale 256.0000 (167.2151) mem 22339MB +[2024-07-28 04:13:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][590/625] eta 0:00:20 lr 0.000071 wd 0.0500 time 0.5827 (0.5805) data time 0.0008 (0.0015) model time 0.5819 (0.5784) loss 7.2313 (6.8801) grad_norm 2.5736 (2.9108) loss_scale 256.0000 (168.7174) mem 22339MB +[2024-07-28 04:13:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][600/625] eta 0:00:14 lr 0.000071 wd 0.0500 time 0.5922 (0.5806) data time 0.0008 (0.0015) model time 0.5914 (0.5785) loss 7.8726 (6.8754) grad_norm 1.8968 (2.9078) loss_scale 256.0000 (170.1697) mem 22339MB +[2024-07-28 04:13:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][610/625] eta 0:00:08 lr 0.000071 wd 0.0500 time 0.5831 (0.5808) data time 0.0006 (0.0015) model time 0.5824 (0.5787) loss 7.5816 (6.8737) grad_norm 2.8811 (2.9051) loss_scale 256.0000 (171.5745) mem 22339MB +[2024-07-28 04:13:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [259/300][620/625] eta 0:00:02 lr 0.000071 wd 0.0500 time 0.5766 (0.5808) data time 0.0004 (0.0015) model time 0.5762 (0.5787) loss 6.9483 (6.8752) grad_norm 2.6390 (2.9191) loss_scale 256.0000 (172.9340) mem 22339MB +[2024-07-28 04:13:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 259 training takes 0:06:03 +[2024-07-28 04:13:39 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 04:13:43 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 04:13:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.457 (0.457) Loss 0.4951 (0.4951) Acc@1 90.332 (90.332) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-28 04:13:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.124 (0.155) Loss 0.7397 (0.5999) Acc@1 82.959 (88.179) Acc@5 97.314 (98.140) Mem 22339MB +[2024-07-28 04:13:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.141) Loss 0.8252 (0.6854) Acc@1 80.762 (85.624) Acc@5 96.094 (97.354) Mem 22339MB +[2024-07-28 04:13:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.235 Acc@5 97.347 +[2024-07-28 04:13:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.2% +[2024-07-28 04:13:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 85.23% +[2024-07-28 04:13:46 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-28 04:13:48 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-28 04:13:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.460 (0.460) Loss 0.4990 (0.4990) Acc@1 90.283 (90.283) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-28 04:13:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.124 (0.155) Loss 0.7393 (0.6055) Acc@1 83.154 (88.144) Acc@5 97.217 (98.176) Mem 22339MB +[2024-07-28 04:13:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.124 (0.141) Loss 0.8296 (0.6899) Acc@1 80.664 (85.440) Acc@5 96.191 (97.366) Mem 22339MB +[2024-07-28 04:13:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.049 Acc@5 97.365 +[2024-07-28 04:13:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.0% +[2024-07-28 04:13:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.05% +[2024-07-28 04:13:52 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-28 04:13:53 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-28 04:13:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][0/625] eta 0:08:43 lr 0.000071 wd 0.0500 time 0.8381 (0.8381) data time 0.3219 (0.3219) model time 0.0000 (0.0000) loss 8.1334 (8.1334) grad_norm 2.8429 (2.8429) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:14:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][10/625] eta 0:06:13 lr 0.000071 wd 0.0500 time 0.5772 (0.6068) data time 0.0009 (0.0305) model time 0.0000 (0.0000) loss 6.9082 (6.8329) grad_norm 1.8151 (3.6499) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:14:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][20/625] eta 0:05:58 lr 0.000071 wd 0.0500 time 0.5773 (0.5933) data time 0.0009 (0.0164) model time 0.0000 (0.0000) loss 6.3094 (6.9818) grad_norm 2.4944 (3.0264) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:14:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][30/625] eta 0:05:51 lr 0.000071 wd 0.0500 time 0.5745 (0.5906) data time 0.0008 (0.0114) model time 0.0000 (0.0000) loss 6.9882 (6.9220) grad_norm 2.7603 (2.8473) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:14:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][40/625] eta 0:05:45 lr 0.000071 wd 0.0500 time 0.5747 (0.5902) data time 0.0009 (0.0088) model time 0.0000 (0.0000) loss 6.6035 (6.8789) grad_norm 1.9371 (2.6735) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:14:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][50/625] eta 0:05:41 lr 0.000071 wd 0.0500 time 0.5754 (0.5937) data time 0.0006 (0.0073) model time 0.0000 (0.0000) loss 7.2571 (6.8551) grad_norm 1.9894 (2.5699) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:14:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][60/625] eta 0:05:33 lr 0.000071 wd 0.0500 time 0.5754 (0.5906) data time 0.0008 (0.0062) model time 0.5746 (0.5740) loss 7.2625 (6.9192) grad_norm 82.0437 (4.1175) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:14:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][70/625] eta 0:05:27 lr 0.000071 wd 0.0500 time 0.5721 (0.5897) data time 0.0007 (0.0056) model time 0.5714 (0.5784) loss 6.2304 (6.9565) grad_norm 3.2087 (3.9037) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:14:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][80/625] eta 0:05:20 lr 0.000070 wd 0.0500 time 0.5685 (0.5884) data time 0.0007 (0.0050) model time 0.5678 (0.5783) loss 6.6492 (6.9246) grad_norm 1.8510 (3.7499) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:14:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][90/625] eta 0:05:14 lr 0.000070 wd 0.0500 time 0.5755 (0.5873) data time 0.0008 (0.0046) model time 0.5747 (0.5780) loss 6.9996 (6.9331) grad_norm 1.9883 (3.6543) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:14:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][100/625] eta 0:05:08 lr 0.000070 wd 0.0500 time 0.5774 (0.5870) data time 0.0006 (0.0043) model time 0.5768 (0.5791) loss 7.5643 (6.9401) grad_norm 3.7628 (3.6222) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:14:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][110/625] eta 0:05:02 lr 0.000070 wd 0.0500 time 0.5759 (0.5868) data time 0.0007 (0.0040) model time 0.5752 (0.5798) loss 5.7042 (6.9325) grad_norm 7.7855 (3.6658) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:15:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][120/625] eta 0:04:56 lr 0.000070 wd 0.0500 time 0.5745 (0.5862) data time 0.0009 (0.0037) model time 0.5736 (0.5796) loss 7.3531 (6.9365) grad_norm 1.8523 (3.5522) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:15:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][130/625] eta 0:04:49 lr 0.000070 wd 0.0500 time 0.5770 (0.5858) data time 0.0009 (0.0036) model time 0.5761 (0.5795) loss 6.5328 (6.9400) grad_norm 2.9318 (3.4603) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:15:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][140/625] eta 0:04:44 lr 0.000070 wd 0.0500 time 0.5744 (0.5857) data time 0.0007 (0.0034) model time 0.5737 (0.5800) loss 7.1658 (6.9120) grad_norm 8.4664 (3.4114) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:15:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][150/625] eta 0:04:37 lr 0.000070 wd 0.0500 time 0.5772 (0.5852) data time 0.0006 (0.0032) model time 0.5766 (0.5798) loss 5.9176 (6.9124) grad_norm 5.2131 (3.3933) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:15:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][160/625] eta 0:04:31 lr 0.000070 wd 0.0500 time 0.5766 (0.5847) data time 0.0006 (0.0031) model time 0.5760 (0.5795) loss 7.4956 (6.9060) grad_norm 2.1133 (3.3194) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:15:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][170/625] eta 0:04:25 lr 0.000070 wd 0.0500 time 0.5761 (0.5843) data time 0.0010 (0.0029) model time 0.5751 (0.5793) loss 7.0699 (6.9144) grad_norm 1.7518 (3.2484) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:15:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][180/625] eta 0:04:19 lr 0.000070 wd 0.0500 time 0.5773 (0.5840) data time 0.0006 (0.0028) model time 0.5767 (0.5791) loss 6.1791 (6.9141) grad_norm 2.3435 (3.1902) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:15:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][190/625] eta 0:04:13 lr 0.000070 wd 0.0500 time 0.5750 (0.5838) data time 0.0009 (0.0027) model time 0.5741 (0.5791) loss 6.4246 (6.9067) grad_norm 2.0294 (3.1669) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:15:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][200/625] eta 0:04:07 lr 0.000070 wd 0.0500 time 0.5746 (0.5833) data time 0.0008 (0.0026) model time 0.5737 (0.5788) loss 6.1186 (6.8788) grad_norm 2.3608 (3.1829) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:15:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][210/625] eta 0:04:01 lr 0.000070 wd 0.0500 time 0.5747 (0.5830) data time 0.0006 (0.0025) model time 0.5741 (0.5785) loss 8.0042 (6.8730) grad_norm 2.6786 (3.2192) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:16:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][220/625] eta 0:03:55 lr 0.000070 wd 0.0500 time 0.5745 (0.5827) data time 0.0008 (0.0025) model time 0.5737 (0.5784) loss 7.2337 (6.8720) grad_norm 1.9978 (3.2088) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:16:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][230/625] eta 0:03:50 lr 0.000070 wd 0.0500 time 0.5757 (0.5824) data time 0.0008 (0.0024) model time 0.5748 (0.5782) loss 7.0854 (6.8790) grad_norm 5.7421 (3.2098) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:16:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][240/625] eta 0:03:44 lr 0.000070 wd 0.0500 time 0.5740 (0.5822) data time 0.0008 (0.0023) model time 0.5732 (0.5781) loss 7.5084 (6.9041) grad_norm 2.7834 (3.2192) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:16:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][250/625] eta 0:03:38 lr 0.000070 wd 0.0500 time 0.5747 (0.5820) data time 0.0006 (0.0023) model time 0.5740 (0.5780) loss 6.9923 (6.9156) grad_norm 1.8664 (3.1919) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:16:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][260/625] eta 0:03:32 lr 0.000070 wd 0.0500 time 0.5772 (0.5824) data time 0.0008 (0.0022) model time 0.5763 (0.5787) loss 7.0007 (6.9214) grad_norm 3.5208 (3.1725) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:16:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][270/625] eta 0:03:27 lr 0.000070 wd 0.0500 time 0.5734 (0.5835) data time 0.0006 (0.0022) model time 0.5728 (0.5801) loss 5.2904 (6.9215) grad_norm 1.8392 (3.1516) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:16:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][280/625] eta 0:03:21 lr 0.000070 wd 0.0500 time 0.5774 (0.5832) data time 0.0008 (0.0021) model time 0.5767 (0.5799) loss 7.3980 (6.9235) grad_norm 1.7162 (3.1329) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:16:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][290/625] eta 0:03:15 lr 0.000069 wd 0.0500 time 0.5741 (0.5830) data time 0.0006 (0.0021) model time 0.5735 (0.5798) loss 6.9842 (6.9251) grad_norm 2.6885 (3.1056) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:16:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][300/625] eta 0:03:09 lr 0.000069 wd 0.0500 time 0.5753 (0.5829) data time 0.0008 (0.0021) model time 0.5745 (0.5797) loss 6.5284 (6.9227) grad_norm 2.1018 (3.0896) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:16:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][310/625] eta 0:03:03 lr 0.000069 wd 0.0500 time 0.5765 (0.5828) data time 0.0008 (0.0020) model time 0.5758 (0.5796) loss 6.5491 (6.9210) grad_norm 1.8379 (3.0822) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:17:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][320/625] eta 0:02:57 lr 0.000069 wd 0.0500 time 0.5769 (0.5827) data time 0.0008 (0.0020) model time 0.5761 (0.5797) loss 6.3042 (6.9212) grad_norm 1.9451 (3.0604) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:17:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][330/625] eta 0:02:51 lr 0.000069 wd 0.0500 time 0.5747 (0.5826) data time 0.0008 (0.0019) model time 0.5739 (0.5796) loss 5.7051 (6.9187) grad_norm 6.9028 (3.0540) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:17:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][340/625] eta 0:02:46 lr 0.000069 wd 0.0500 time 0.5767 (0.5826) data time 0.0006 (0.0019) model time 0.5762 (0.5797) loss 7.2278 (6.9156) grad_norm 1.9323 (3.0485) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:17:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][350/625] eta 0:02:40 lr 0.000069 wd 0.0500 time 0.5738 (0.5824) data time 0.0009 (0.0019) model time 0.5729 (0.5796) loss 8.6159 (6.9140) grad_norm 1.9380 (3.0302) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:17:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][360/625] eta 0:02:34 lr 0.000069 wd 0.0500 time 0.5746 (0.5823) data time 0.0008 (0.0019) model time 0.5738 (0.5794) loss 6.9429 (6.9093) grad_norm 2.0110 (3.0265) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:17:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][370/625] eta 0:02:28 lr 0.000069 wd 0.0500 time 0.5745 (0.5826) data time 0.0008 (0.0018) model time 0.5738 (0.5799) loss 6.8668 (6.8957) grad_norm 2.4464 (3.0312) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:17:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][380/625] eta 0:02:22 lr 0.000069 wd 0.0500 time 0.5758 (0.5825) data time 0.0007 (0.0018) model time 0.5751 (0.5798) loss 7.0075 (6.8828) grad_norm 2.5546 (3.0386) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:17:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][390/625] eta 0:02:16 lr 0.000069 wd 0.0500 time 0.5755 (0.5828) data time 0.0008 (0.0019) model time 0.5746 (0.5801) loss 7.2562 (6.8912) grad_norm 1.8012 (3.0430) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:17:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][400/625] eta 0:02:11 lr 0.000069 wd 0.0500 time 0.5757 (0.5828) data time 0.0008 (0.0019) model time 0.5749 (0.5801) loss 7.0640 (6.8933) grad_norm 1.7782 (3.0310) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:17:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][410/625] eta 0:02:05 lr 0.000069 wd 0.0500 time 0.5760 (0.5828) data time 0.0007 (0.0019) model time 0.5753 (0.5801) loss 5.9558 (6.8867) grad_norm 2.8165 (3.0223) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:17:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][420/625] eta 0:01:59 lr 0.000069 wd 0.0500 time 0.5756 (0.5827) data time 0.0008 (0.0019) model time 0.5748 (0.5801) loss 7.8115 (6.8854) grad_norm 2.4073 (3.0128) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:18:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][430/625] eta 0:01:53 lr 0.000069 wd 0.0500 time 0.5746 (0.5828) data time 0.0006 (0.0018) model time 0.5740 (0.5802) loss 6.3197 (6.8802) grad_norm 2.5490 (3.0102) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:18:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][440/625] eta 0:01:47 lr 0.000069 wd 0.0500 time 0.5755 (0.5828) data time 0.0006 (0.0018) model time 0.5749 (0.5803) loss 6.5369 (6.8746) grad_norm 2.5928 (3.0046) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:18:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][450/625] eta 0:01:41 lr 0.000069 wd 0.0500 time 0.5806 (0.5828) data time 0.0006 (0.0018) model time 0.5799 (0.5802) loss 6.5823 (6.8621) grad_norm 3.6407 (3.0270) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:18:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][460/625] eta 0:01:36 lr 0.000069 wd 0.0500 time 0.5752 (0.5828) data time 0.0009 (0.0018) model time 0.5744 (0.5803) loss 7.6186 (6.8619) grad_norm 3.4463 (3.0147) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:18:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][470/625] eta 0:01:30 lr 0.000069 wd 0.0500 time 0.5777 (0.5827) data time 0.0006 (0.0018) model time 0.5771 (0.5803) loss 7.0795 (6.8600) grad_norm 3.6237 (3.0042) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:18:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][480/625] eta 0:01:24 lr 0.000069 wd 0.0500 time 0.5765 (0.5829) data time 0.0009 (0.0018) model time 0.5756 (0.5805) loss 7.2016 (6.8668) grad_norm 2.9168 (2.9937) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:18:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][490/625] eta 0:01:18 lr 0.000069 wd 0.0500 time 0.5761 (0.5834) data time 0.0006 (0.0017) model time 0.5755 (0.5811) loss 5.8538 (6.8791) grad_norm 2.4279 (2.9828) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:18:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][500/625] eta 0:01:12 lr 0.000069 wd 0.0500 time 0.5752 (0.5833) data time 0.0006 (0.0017) model time 0.5746 (0.5810) loss 7.3702 (6.8795) grad_norm 1.9003 (2.9654) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:18:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][510/625] eta 0:01:07 lr 0.000068 wd 0.0500 time 0.5769 (0.5831) data time 0.0007 (0.0017) model time 0.5762 (0.5809) loss 6.0738 (6.8769) grad_norm 2.1164 (2.9562) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:18:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][520/625] eta 0:01:01 lr 0.000068 wd 0.0500 time 0.5753 (0.5830) data time 0.0007 (0.0017) model time 0.5746 (0.5808) loss 6.1781 (6.8864) grad_norm 2.1585 (2.9498) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:19:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][530/625] eta 0:00:55 lr 0.000068 wd 0.0500 time 0.5768 (0.5829) data time 0.0009 (0.0017) model time 0.5759 (0.5807) loss 5.6089 (6.8749) grad_norm 2.7005 (2.9510) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:19:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][540/625] eta 0:00:49 lr 0.000068 wd 0.0500 time 0.5734 (0.5831) data time 0.0006 (0.0017) model time 0.5727 (0.5809) loss 6.3646 (6.8680) grad_norm 2.0944 (2.9486) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:19:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][550/625] eta 0:00:43 lr 0.000068 wd 0.0500 time 0.5768 (0.5830) data time 0.0007 (0.0017) model time 0.5762 (0.5808) loss 6.6584 (6.8660) grad_norm 1.7650 (2.9381) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:19:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][560/625] eta 0:00:37 lr 0.000068 wd 0.0500 time 0.5763 (0.5829) data time 0.0008 (0.0016) model time 0.5755 (0.5807) loss 7.2742 (6.8692) grad_norm 1.8391 (2.9277) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:19:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][570/625] eta 0:00:32 lr 0.000068 wd 0.0500 time 0.5760 (0.5828) data time 0.0006 (0.0016) model time 0.5754 (0.5806) loss 7.0989 (6.8732) grad_norm 68.6494 (3.0361) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:19:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][580/625] eta 0:00:26 lr 0.000068 wd 0.0500 time 0.5780 (0.5828) data time 0.0006 (0.0016) model time 0.5773 (0.5806) loss 7.4009 (6.8751) grad_norm 2.0941 (3.0257) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:19:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][590/625] eta 0:00:20 lr 0.000068 wd 0.0500 time 0.5736 (0.5828) data time 0.0007 (0.0016) model time 0.5729 (0.5807) loss 6.6464 (6.8792) grad_norm 1.9494 (3.0119) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:19:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][600/625] eta 0:00:14 lr 0.000068 wd 0.0500 time 0.5756 (0.5827) data time 0.0006 (0.0016) model time 0.5751 (0.5806) loss 5.6132 (6.8766) grad_norm 1.9962 (3.0112) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:19:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][610/625] eta 0:00:08 lr 0.000068 wd 0.0500 time 0.5743 (0.5827) data time 0.0006 (0.0016) model time 0.5737 (0.5806) loss 8.1429 (6.8786) grad_norm 7.1763 (3.0088) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:19:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [260/300][620/625] eta 0:00:02 lr 0.000068 wd 0.0500 time 0.5777 (0.5826) data time 0.0006 (0.0016) model time 0.5771 (0.5806) loss 6.5242 (6.8800) grad_norm 2.3571 (3.0013) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:19:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 260 training takes 0:06:04 +[2024-07-28 04:19:57 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 04:19:59 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 04:20:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.457 (0.457) Loss 0.4890 (0.4890) Acc@1 90.430 (90.430) Acc@5 99.072 (99.072) Mem 22339MB +[2024-07-28 04:20:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.155) Loss 0.7378 (0.5968) Acc@1 82.959 (88.197) Acc@5 97.168 (98.171) Mem 22339MB +[2024-07-28 04:20:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.124 (0.141) Loss 0.8159 (0.6820) Acc@1 81.055 (85.538) Acc@5 96.289 (97.354) Mem 22339MB +[2024-07-28 04:20:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.167 Acc@5 97.337 +[2024-07-28 04:20:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.2% +[2024-07-28 04:20:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.794 (0.794) Loss 0.4990 (0.4990) Acc@1 90.283 (90.283) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-28 04:20:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.186) Loss 0.7388 (0.6050) Acc@1 83.252 (88.161) Acc@5 97.168 (98.176) Mem 22339MB +[2024-07-28 04:20:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.157) Loss 0.8291 (0.6893) Acc@1 80.713 (85.459) Acc@5 96.191 (97.370) Mem 22339MB +[2024-07-28 04:20:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.069 Acc@5 97.369 +[2024-07-28 04:20:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.1% +[2024-07-28 04:20:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.07% +[2024-07-28 04:20:06 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-28 04:20:13 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-28 04:20:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][0/625] eta 0:09:37 lr 0.000068 wd 0.0500 time 0.9245 (0.9245) data time 0.4084 (0.4084) model time 0.0000 (0.0000) loss 7.2537 (7.2537) grad_norm 2.3355 (2.3355) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:20:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][10/625] eta 0:06:21 lr 0.000068 wd 0.0500 time 0.5757 (0.6200) data time 0.0007 (0.0380) model time 0.0000 (0.0000) loss 5.6762 (6.5517) grad_norm 2.0119 (3.2030) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:20:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][20/625] eta 0:06:03 lr 0.000068 wd 0.0500 time 0.5747 (0.6002) data time 0.0007 (0.0203) model time 0.0000 (0.0000) loss 7.8519 (6.8644) grad_norm 2.5193 (3.0314) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:20:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][30/625] eta 0:05:53 lr 0.000068 wd 0.0500 time 0.5732 (0.5935) data time 0.0009 (0.0146) model time 0.0000 (0.0000) loss 7.7708 (6.9098) grad_norm 2.9177 (2.8375) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:20:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][40/625] eta 0:05:45 lr 0.000068 wd 0.0500 time 0.5753 (0.5910) data time 0.0006 (0.0116) model time 0.0000 (0.0000) loss 5.5500 (6.9361) grad_norm 2.1768 (2.6969) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:20:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][50/625] eta 0:05:38 lr 0.000068 wd 0.0500 time 0.5751 (0.5884) data time 0.0009 (0.0095) model time 0.0000 (0.0000) loss 7.1926 (6.8889) grad_norm 2.5726 (2.7357) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:20:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][60/625] eta 0:05:32 lr 0.000068 wd 0.0500 time 0.5755 (0.5882) data time 0.0006 (0.0084) model time 0.5749 (0.5843) loss 6.7736 (6.9045) grad_norm 2.1872 (2.6988) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:20:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][70/625] eta 0:05:27 lr 0.000068 wd 0.0500 time 0.7484 (0.5899) data time 0.0006 (0.0075) model time 0.7478 (0.5912) loss 7.0143 (6.8829) grad_norm 2.7473 (2.6607) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:21:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][80/625] eta 0:05:23 lr 0.000068 wd 0.0500 time 0.5788 (0.5929) data time 0.0009 (0.0067) model time 0.5779 (0.5986) loss 7.3577 (6.8729) grad_norm 1.9617 (2.6291) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:21:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][90/625] eta 0:05:17 lr 0.000068 wd 0.0500 time 0.5756 (0.5939) data time 0.0009 (0.0061) model time 0.5747 (0.5992) loss 6.5695 (6.9028) grad_norm 3.4515 (2.6264) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:21:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][100/625] eta 0:05:11 lr 0.000068 wd 0.0500 time 0.5734 (0.5928) data time 0.0009 (0.0056) model time 0.5725 (0.5958) loss 6.7195 (6.8686) grad_norm 2.3898 (2.6057) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:21:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][110/625] eta 0:05:04 lr 0.000067 wd 0.0500 time 0.5758 (0.5914) data time 0.0006 (0.0051) model time 0.5752 (0.5926) loss 7.5976 (6.8502) grad_norm 2.7824 (2.5991) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:21:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][120/625] eta 0:04:58 lr 0.000067 wd 0.0500 time 0.5755 (0.5905) data time 0.0006 (0.0048) model time 0.5748 (0.5906) loss 6.4277 (6.8551) grad_norm 1.5978 (2.5839) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:21:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][130/625] eta 0:04:52 lr 0.000067 wd 0.0500 time 0.5756 (0.5902) data time 0.0006 (0.0047) model time 0.5750 (0.5897) loss 8.1227 (6.8409) grad_norm 2.0336 (2.7016) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:21:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][140/625] eta 0:04:46 lr 0.000067 wd 0.0500 time 0.5749 (0.5898) data time 0.0006 (0.0044) model time 0.5743 (0.5891) loss 6.0118 (6.8580) grad_norm 4.0762 (2.6943) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:21:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][150/625] eta 0:04:39 lr 0.000067 wd 0.0500 time 0.5776 (0.5891) data time 0.0006 (0.0042) model time 0.5769 (0.5879) loss 6.8160 (6.8584) grad_norm 3.5766 (2.6751) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:21:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][160/625] eta 0:04:33 lr 0.000067 wd 0.0500 time 0.5756 (0.5885) data time 0.0007 (0.0040) model time 0.5749 (0.5872) loss 7.0626 (6.8484) grad_norm 2.1897 (2.6432) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-28 04:21:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-28 04:21:48 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 04:21:51 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 04:25:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 04:25:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 04:25:36 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-28 04:26:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-28 04:26:00 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-28 04:26:00 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-28 04:26:00 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-28 04:26:01 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 261) +[2024-07-28 04:26:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-28 04:26:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-28 04:26:14 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 04:26:17 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 04:28:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 04:28:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 04:28:45 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-28 04:28:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-28 04:28:58 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-28 04:28:58 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-28 04:28:58 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-28 04:28:59 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 261) +[2024-07-28 04:28:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-28 04:29:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-28 04:29:14 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 04:29:17 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 04:33:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 04:33:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 04:34:20 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-28 04:34:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-28 04:34:33 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-28 04:34:33 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-28 04:34:34 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-28 04:34:34 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 261) +[2024-07-28 04:34:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-28 04:34:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][170/625] eta 0:15:43 lr 0.000067 wd 0.0500 time 0.5879 (2.0746) data time 0.0009 (0.0819) model time 0.5869 (1.9927) loss 6.9479 (7.0215) grad_norm 2.3171 (2.4562) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 04:35:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][180/625] eta 0:09:15 lr 0.000067 wd 0.0500 time 0.5917 (1.2482) data time 0.0009 (0.0370) model time 0.5908 (1.2112) loss 7.6966 (6.9603) grad_norm 2.0527 (2.3847) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 04:35:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][190/625] eta 0:07:20 lr 0.000067 wd 0.0500 time 0.5769 (1.0115) data time 0.0010 (0.0243) model time 0.5758 (0.9873) loss 7.2524 (6.9489) grad_norm 1.8973 (2.3369) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 04:35:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][200/625] eta 0:06:21 lr 0.000067 wd 0.0500 time 0.5792 (0.8985) data time 0.0010 (0.0182) model time 0.5781 (0.8803) loss 6.6130 (6.9020) grad_norm 2.8168 (2.5452) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 04:35:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][210/625] eta 0:05:45 lr 0.000067 wd 0.0500 time 0.5777 (0.8327) data time 0.0008 (0.0147) model time 0.5769 (0.8179) loss 7.1600 (6.8800) grad_norm 2.0096 (2.4945) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 04:35:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][220/625] eta 0:05:22 lr 0.000067 wd 0.0500 time 0.5695 (0.7965) data time 0.0014 (0.0124) model time 0.5681 (0.7841) loss 6.2758 (6.8799) grad_norm 1.6237 (2.5195) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 04:35:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][230/625] eta 0:05:02 lr 0.000067 wd 0.0500 time 0.5825 (0.7647) data time 0.0008 (0.0109) model time 0.5817 (0.7538) loss 5.7815 (6.8831) grad_norm 1.6363 (2.5305) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 04:35:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][240/625] eta 0:04:45 lr 0.000067 wd 0.0500 time 0.6162 (0.7427) data time 0.0009 (0.0097) model time 0.6153 (0.7330) loss 6.8445 (6.8875) grad_norm 3.0194 (2.5173) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 04:35:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][250/625] eta 0:04:31 lr 0.000067 wd 0.0500 time 0.5824 (0.7242) data time 0.0010 (0.0087) model time 0.5814 (0.7155) loss 7.3946 (6.8614) grad_norm 2.9477 (2.5593) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 04:35:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][260/625] eta 0:04:19 lr 0.000067 wd 0.0500 time 0.5796 (0.7098) data time 0.0008 (0.0080) model time 0.5788 (0.7017) loss 8.0996 (6.8859) grad_norm 2.2612 (2.5876) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 04:35:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][270/625] eta 0:04:07 lr 0.000067 wd 0.0500 time 0.5806 (0.6980) data time 0.0010 (0.0074) model time 0.5796 (0.6906) loss 5.6326 (6.9023) grad_norm 1.8948 (2.6142) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 04:35:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][280/625] eta 0:03:57 lr 0.000067 wd 0.0500 time 0.5832 (0.6883) data time 0.0010 (0.0069) model time 0.5822 (0.6814) loss 6.6728 (6.9032) grad_norm 3.0734 (2.8715) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 04:36:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][290/625] eta 0:03:47 lr 0.000067 wd 0.0500 time 0.5960 (0.6803) data time 0.0007 (0.0064) model time 0.5953 (0.6739) loss 6.5859 (6.9024) grad_norm 2.1515 (2.8501) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 04:36:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][300/625] eta 0:03:38 lr 0.000067 wd 0.0500 time 0.5839 (0.6733) data time 0.0010 (0.0060) model time 0.5828 (0.6673) loss 6.0413 (6.9102) grad_norm 2.8176 (2.8384) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 04:36:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][310/625] eta 0:03:30 lr 0.000067 wd 0.0500 time 0.5845 (0.6676) data time 0.0008 (0.0057) model time 0.5837 (0.6619) loss 7.4049 (6.9012) grad_norm 2.0897 (2.9415) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 04:36:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][320/625] eta 0:03:22 lr 0.000067 wd 0.0500 time 0.5849 (0.6624) data time 0.0008 (0.0054) model time 0.5842 (0.6570) loss 6.8811 (6.8981) grad_norm 28.5142 (3.1060) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 04:36:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][330/625] eta 0:03:14 lr 0.000066 wd 0.0500 time 0.5848 (0.6578) data time 0.0011 (0.0052) model time 0.5837 (0.6526) loss 7.7938 (6.9200) grad_norm 2.9396 (3.1325) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-28 04:36:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-28 04:36:30 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-28 04:36:36 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-28 04:41:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 04:41:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 22:36:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-28 22:36:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-28 22:37:10 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-28 22:37:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-28 22:37:24 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-28 22:37:25 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-28 22:37:25 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-28 22:37:25 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 261) +[2024-07-28 22:37:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 00:30:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 00:30:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 00:31:07 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 00:31:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 00:31:29 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 00:31:29 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 00:31:30 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 00:31:30 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 261) +[2024-07-29 00:31:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 00:31:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][340/625] eta 0:08:19 lr 0.000066 wd 0.0500 time 0.5728 (1.7523) data time 0.0006 (0.0761) model time 0.5722 (1.6762) loss 7.7024 (7.3200) grad_norm 1.7284 (2.8141) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 00:31:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][350/625] eta 0:05:01 lr 0.000066 wd 0.0500 time 0.5718 (1.0960) data time 0.0006 (0.0343) model time 0.5712 (1.0617) loss 7.6670 (7.2318) grad_norm 3.0986 (2.8478) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 00:31:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][360/625] eta 0:04:00 lr 0.000066 wd 0.0500 time 0.5651 (0.9092) data time 0.0009 (0.0224) model time 0.5642 (0.8869) loss 7.6609 (7.1310) grad_norm 2.1554 (2.9229) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 00:32:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][370/625] eta 0:03:29 lr 0.000066 wd 0.0500 time 0.5709 (0.8203) data time 0.0008 (0.0167) model time 0.5701 (0.8036) loss 6.7774 (7.0404) grad_norm 2.8972 (2.8258) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 00:32:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][380/625] eta 0:03:08 lr 0.000066 wd 0.0500 time 0.5686 (0.7681) data time 0.0006 (0.0134) model time 0.5679 (0.7547) loss 7.2097 (7.0173) grad_norm 2.8597 (2.7645) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 00:32:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][390/625] eta 0:02:53 lr 0.000066 wd 0.0500 time 0.5682 (0.7398) data time 0.0006 (0.0113) model time 0.5676 (0.7285) loss 5.5749 (6.9590) grad_norm 3.1653 (2.7845) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 00:32:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][400/625] eta 0:02:40 lr 0.000066 wd 0.0500 time 0.5740 (0.7151) data time 0.0006 (0.0098) model time 0.5734 (0.7054) loss 6.5478 (6.9254) grad_norm 2.7620 (2.8159) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 00:32:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][410/625] eta 0:02:29 lr 0.000066 wd 0.0500 time 0.5768 (0.6969) data time 0.0006 (0.0086) model time 0.5763 (0.6883) loss 6.4933 (6.9247) grad_norm 5.1988 (2.8189) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 00:32:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][420/625] eta 0:02:20 lr 0.000066 wd 0.0500 time 0.5781 (0.6832) data time 0.0008 (0.0078) model time 0.5773 (0.6754) loss 7.5184 (6.9047) grad_norm 4.2792 (2.8252) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 00:32:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][430/625] eta 0:02:11 lr 0.000066 wd 0.0500 time 0.5767 (0.6719) data time 0.0007 (0.0071) model time 0.5760 (0.6648) loss 7.8567 (6.9207) grad_norm 4.2238 (2.8962) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 00:32:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][440/625] eta 0:02:02 lr 0.000066 wd 0.0500 time 0.5715 (0.6627) data time 0.0006 (0.0065) model time 0.5709 (0.6562) loss 5.9869 (6.9240) grad_norm 2.3685 (2.8276) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 00:32:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][450/625] eta 0:01:54 lr 0.000066 wd 0.0500 time 0.5737 (0.6552) data time 0.0008 (0.0060) model time 0.5729 (0.6491) loss 7.4788 (6.9370) grad_norm 1.8176 (2.7832) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 00:32:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][460/625] eta 0:01:47 lr 0.000066 wd 0.0500 time 0.5742 (0.6488) data time 0.0006 (0.0056) model time 0.5736 (0.6431) loss 6.9463 (6.9382) grad_norm 1.8156 (2.8031) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 00:33:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][470/625] eta 0:01:39 lr 0.000066 wd 0.0500 time 0.5766 (0.6434) data time 0.0008 (0.0053) model time 0.5759 (0.6382) loss 6.5534 (6.9235) grad_norm 3.4117 (2.8097) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 00:33:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][480/625] eta 0:01:32 lr 0.000066 wd 0.0500 time 0.5774 (0.6388) data time 0.0006 (0.0050) model time 0.5767 (0.6338) loss 6.3795 (6.9132) grad_norm 2.5457 (2.7919) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 00:33:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][490/625] eta 0:01:25 lr 0.000066 wd 0.0500 time 0.5772 (0.6349) data time 0.0006 (0.0047) model time 0.5766 (0.6302) loss 6.2676 (6.9024) grad_norm 2.2721 (2.7756) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 00:33:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][500/625] eta 0:01:18 lr 0.000066 wd 0.0500 time 0.5824 (0.6315) data time 0.0008 (0.0045) model time 0.5815 (0.6270) loss 7.9164 (6.9231) grad_norm 2.7818 (2.7502) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 00:33:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 00:33:25 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 00:33:30 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 00:45:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 00:45:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 00:46:02 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 00:46:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 00:46:12 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 00:46:13 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 00:46:13 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 00:46:13 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 261) +[2024-07-29 00:46:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 00:46:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][510/625] eta 0:16:13 lr 0.000066 wd 0.0500 time 8.4636 (8.4636) data time 0.7117 (0.7117) model time 7.7519 (7.7519) loss 7.0677 (7.0677) grad_norm 2.3018 (2.3018) loss_scale 256.0000 (256.0000) mem 26016MB +[2024-07-29 00:46:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][520/625] eta 0:02:26 lr 0.000066 wd 0.0500 time 0.5769 (1.3925) data time 0.0008 (0.0656) model time 0.5761 (1.3268) loss 6.8411 (7.0980) grad_norm 3.0182 (2.3142) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 00:46:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][530/625] eta 0:01:35 lr 0.000066 wd 0.0500 time 0.5686 (1.0041) data time 0.0012 (0.0348) model time 0.5674 (0.9693) loss 7.1892 (7.0395) grad_norm 7.5316 (2.9130) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 00:46:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][540/625] eta 0:01:13 lr 0.000066 wd 0.0500 time 0.5758 (0.8652) data time 0.0006 (0.0239) model time 0.5752 (0.8414) loss 5.6124 (7.1297) grad_norm 2.1711 (2.7839) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 00:46:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][550/625] eta 0:00:59 lr 0.000066 wd 0.0500 time 0.5766 (0.7947) data time 0.0008 (0.0183) model time 0.5758 (0.7765) loss 6.8102 (7.0722) grad_norm 3.8065 (2.8552) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 00:46:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][560/625] eta 0:00:49 lr 0.000065 wd 0.0500 time 0.7645 (0.7557) data time 0.0006 (0.0149) model time 0.7639 (0.7408) loss 7.6304 (7.0413) grad_norm 1.7839 (2.7728) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 00:47:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][570/625] eta 0:00:40 lr 0.000065 wd 0.0500 time 0.5792 (0.7292) data time 0.0009 (0.0126) model time 0.5783 (0.7166) loss 6.5468 (6.9704) grad_norm 3.0018 (2.7853) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 00:47:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][580/625] eta 0:00:31 lr 0.000065 wd 0.0500 time 0.5707 (0.7080) data time 0.0008 (0.0109) model time 0.5699 (0.6971) loss 6.7547 (6.9339) grad_norm 2.2218 (2.7674) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 00:47:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][590/625] eta 0:00:24 lr 0.000065 wd 0.0500 time 0.5734 (0.6921) data time 0.0008 (0.0097) model time 0.5726 (0.6824) loss 6.1471 (6.9074) grad_norm 3.2435 (2.8034) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 00:47:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][600/625] eta 0:00:16 lr 0.000065 wd 0.0500 time 0.5724 (0.6796) data time 0.0007 (0.0087) model time 0.5718 (0.6709) loss 8.2726 (6.8996) grad_norm 3.9618 (2.8992) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 00:47:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][610/625] eta 0:00:10 lr 0.000065 wd 0.0500 time 0.5768 (0.6696) data time 0.0004 (0.0080) model time 0.5764 (0.6616) loss 7.2626 (6.9137) grad_norm 2.8138 (2.9199) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 00:47:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [261/300][620/625] eta 0:00:03 lr 0.000065 wd 0.0500 time 0.5775 (0.6613) data time 0.0007 (0.0073) model time 0.5768 (0.6539) loss 6.7854 (6.9011) grad_norm 1.9590 (2.9193) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 00:47:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 261 training takes 0:01:15 +[2024-07-29 00:47:33 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 00:47:40 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 00:47:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.477 (0.477) Loss 0.4937 (0.4937) Acc@1 90.576 (90.576) Acc@5 98.975 (98.975) Mem 22344MB +[2024-07-29 00:47:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.124 (0.157) Loss 0.7354 (0.5971) Acc@1 82.910 (88.232) Acc@5 97.412 (98.180) Mem 22344MB +[2024-07-29 00:47:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.141) Loss 0.8198 (0.6840) Acc@1 81.104 (85.545) Acc@5 96.143 (97.368) Mem 22344MB +[2024-07-29 00:47:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.165 Acc@5 97.381 +[2024-07-29 00:47:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.2% +[2024-07-29 00:47:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.717 (0.717) Loss 0.4990 (0.4990) Acc@1 90.332 (90.332) Acc@5 98.926 (98.926) Mem 22344MB +[2024-07-29 00:47:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.124 (0.178) Loss 0.7378 (0.6049) Acc@1 83.203 (88.161) Acc@5 97.119 (98.171) Mem 22344MB +[2024-07-29 00:47:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.124 (0.153) Loss 0.8286 (0.6890) Acc@1 80.811 (85.477) Acc@5 96.191 (97.366) Mem 22344MB +[2024-07-29 00:47:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.085 Acc@5 97.359 +[2024-07-29 00:47:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.1% +[2024-07-29 00:47:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.08% +[2024-07-29 00:47:49 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 00:47:52 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 00:47:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][0/625] eta 0:11:26 lr 0.000065 wd 0.0500 time 1.0978 (1.0978) data time 0.3628 (0.3628) model time 0.0000 (0.0000) loss 7.7441 (7.7441) grad_norm 2.5832 (2.5832) loss_scale 256.0000 (256.0000) mem 22337MB +[2024-07-29 00:47:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][10/625] eta 0:06:25 lr 0.000065 wd 0.0500 time 0.5881 (0.6266) data time 0.0006 (0.0337) model time 0.0000 (0.0000) loss 5.6374 (6.7702) grad_norm 1.5392 (3.5138) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 00:48:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][20/625] eta 0:06:05 lr 0.000065 wd 0.0500 time 0.5783 (0.6044) data time 0.0009 (0.0180) model time 0.0000 (0.0000) loss 5.8519 (6.8652) grad_norm 2.1211 (3.0506) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 00:48:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][30/625] eta 0:05:54 lr 0.000065 wd 0.0500 time 0.5803 (0.5958) data time 0.0006 (0.0125) model time 0.0000 (0.0000) loss 6.4539 (6.8355) grad_norm 1.6732 (2.8466) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 00:48:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][40/625] eta 0:05:45 lr 0.000065 wd 0.0500 time 0.5757 (0.5914) data time 0.0008 (0.0096) model time 0.0000 (0.0000) loss 7.0961 (6.8673) grad_norm 2.0015 (2.7182) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 00:48:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][50/625] eta 0:05:38 lr 0.000065 wd 0.0500 time 0.5815 (0.5891) data time 0.0009 (0.0079) model time 0.0000 (0.0000) loss 7.1441 (6.9010) grad_norm 3.9124 (2.7180) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 00:48:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][60/625] eta 0:05:31 lr 0.000065 wd 0.0500 time 0.5783 (0.5876) data time 0.0006 (0.0068) model time 0.5777 (0.5789) loss 6.7191 (6.8536) grad_norm 3.1969 (2.8570) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 00:48:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][70/625] eta 0:05:25 lr 0.000065 wd 0.0500 time 0.5816 (0.5865) data time 0.0008 (0.0059) model time 0.5808 (0.5790) loss 5.9627 (6.8506) grad_norm 2.6173 (2.8104) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 00:48:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][80/625] eta 0:05:19 lr 0.000065 wd 0.0500 time 0.5820 (0.5858) data time 0.0007 (0.0053) model time 0.5813 (0.5791) loss 7.1232 (6.8525) grad_norm 2.2479 (2.7586) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 00:48:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][90/625] eta 0:05:13 lr 0.000065 wd 0.0500 time 0.5839 (0.5859) data time 0.0006 (0.0049) model time 0.5833 (0.5809) loss 5.5594 (6.8028) grad_norm 2.1128 (2.8531) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 00:48:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][100/625] eta 0:05:07 lr 0.000065 wd 0.0500 time 0.5774 (0.5852) data time 0.0007 (0.0045) model time 0.5767 (0.5803) loss 6.1030 (6.8257) grad_norm 2.5511 (2.9696) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 00:48:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 00:48:55 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 00:48:56 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 00:52:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 00:52:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 00:53:02 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 01:15:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 01:15:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 01:23:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 01:23:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 01:23:44 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 01:32:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 01:32:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 01:38:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 01:38:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 01:39:05 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 01:39:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 01:39:22 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 01:39:23 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 01:39:23 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 01:39:23 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 262) +[2024-07-29 01:39:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 01:39:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][110/625] eta 0:26:04 lr 0.000065 wd 0.0500 time 0.5803 (3.0371) data time 0.0008 (0.1838) model time 0.5794 (2.8533) loss 6.3804 (7.2316) grad_norm 2.2182 (2.6262) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 01:39:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][120/625] eta 0:10:44 lr 0.000065 wd 0.0500 time 0.5795 (1.2770) data time 0.0006 (0.0531) model time 0.5789 (1.2238) loss 7.3541 (6.9624) grad_norm 2.3904 (2.7429) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 01:39:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][130/625] eta 0:08:07 lr 0.000065 wd 0.0500 time 0.5713 (0.9843) data time 0.0009 (0.0314) model time 0.5704 (0.9528) loss 6.6854 (7.0206) grad_norm 1.8964 (2.6178) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 01:39:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][140/625] eta 0:06:58 lr 0.000065 wd 0.0500 time 0.5789 (0.8628) data time 0.0007 (0.0224) model time 0.5782 (0.8404) loss 6.2680 (7.0577) grad_norm 2.2309 (2.8116) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 01:40:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][150/625] eta 0:06:18 lr 0.000065 wd 0.0500 time 0.5733 (0.7972) data time 0.0006 (0.0176) model time 0.5727 (0.7796) loss 6.9218 (6.9906) grad_norm 1.9108 (2.7325) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 01:40:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][160/625] eta 0:05:52 lr 0.000064 wd 0.0500 time 0.5701 (0.7584) data time 0.0006 (0.0145) model time 0.5695 (0.7439) loss 7.0588 (6.9928) grad_norm 1.8530 (2.6907) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 01:40:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][170/625] eta 0:05:33 lr 0.000064 wd 0.0500 time 0.5794 (0.7329) data time 0.0007 (0.0123) model time 0.5788 (0.7206) loss 7.5613 (6.9603) grad_norm 2.4065 (2.6793) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 01:40:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][180/625] eta 0:05:16 lr 0.000064 wd 0.0500 time 0.5783 (0.7117) data time 0.0008 (0.0108) model time 0.5775 (0.7009) loss 7.0553 (6.9613) grad_norm 2.5142 (2.6803) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 01:40:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][190/625] eta 0:05:02 lr 0.000064 wd 0.0500 time 0.5825 (0.6958) data time 0.0008 (0.0097) model time 0.5817 (0.6861) loss 7.2747 (6.9500) grad_norm 4.0984 (2.6911) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 01:40:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][200/625] eta 0:04:50 lr 0.000064 wd 0.0500 time 0.5806 (0.6832) data time 0.0009 (0.0087) model time 0.5797 (0.6745) loss 5.8160 (6.9141) grad_norm 2.8524 (2.6830) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 01:40:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][210/625] eta 0:04:39 lr 0.000064 wd 0.0500 time 0.5905 (0.6730) data time 0.0009 (0.0080) model time 0.5896 (0.6650) loss 6.6079 (6.9514) grad_norm 2.0367 (2.6663) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 01:40:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][220/625] eta 0:04:29 lr 0.000064 wd 0.0500 time 0.5822 (0.6645) data time 0.0009 (0.0074) model time 0.5813 (0.6571) loss 7.3196 (6.9305) grad_norm 2.2931 (2.6256) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 01:40:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][230/625] eta 0:04:19 lr 0.000064 wd 0.0500 time 0.5791 (0.6574) data time 0.0009 (0.0068) model time 0.5782 (0.6506) loss 5.8076 (6.9087) grad_norm 2.0822 (2.8037) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 01:40:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][240/625] eta 0:04:10 lr 0.000064 wd 0.0500 time 0.5786 (0.6515) data time 0.0009 (0.0064) model time 0.5777 (0.6451) loss 7.7564 (6.9135) grad_norm 2.4793 (2.8116) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 01:40:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][250/625] eta 0:04:02 lr 0.000064 wd 0.0500 time 0.5855 (0.6466) data time 0.0007 (0.0060) model time 0.5848 (0.6406) loss 6.3549 (6.9043) grad_norm 2.2775 (2.7875) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 01:41:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][260/625] eta 0:03:54 lr 0.000064 wd 0.0500 time 0.5848 (0.6424) data time 0.0007 (0.0057) model time 0.5841 (0.6368) loss 6.3599 (6.8969) grad_norm 1.9788 (2.7803) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 01:41:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][270/625] eta 0:03:46 lr 0.000064 wd 0.0500 time 0.5824 (0.6387) data time 0.0007 (0.0054) model time 0.5817 (0.6333) loss 6.8148 (6.8985) grad_norm 1.9945 (2.7893) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 01:41:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][280/625] eta 0:03:39 lr 0.000064 wd 0.0500 time 0.5822 (0.6353) data time 0.0009 (0.0051) model time 0.5813 (0.6301) loss 5.3698 (6.8866) grad_norm 1.7808 (2.7761) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 01:41:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][290/625] eta 0:03:31 lr 0.000064 wd 0.0500 time 0.5821 (0.6322) data time 0.0007 (0.0049) model time 0.5814 (0.6273) loss 7.4506 (6.8960) grad_norm 2.9225 (2.8479) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 01:41:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][300/625] eta 0:03:24 lr 0.000064 wd 0.0500 time 0.5853 (0.6294) data time 0.0006 (0.0047) model time 0.5847 (0.6247) loss 7.0123 (6.8979) grad_norm 2.3141 (2.8983) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 01:41:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][310/625] eta 0:03:17 lr 0.000064 wd 0.0500 time 0.5846 (0.6268) data time 0.0008 (0.0045) model time 0.5837 (0.6223) loss 7.0594 (6.8737) grad_norm 1.9569 (2.9236) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 01:41:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][320/625] eta 0:03:10 lr 0.000064 wd 0.0500 time 0.5887 (0.6246) data time 0.0007 (0.0043) model time 0.5880 (0.6203) loss 6.4248 (6.8726) grad_norm 1.8309 (2.9169) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 01:41:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][330/625] eta 0:03:03 lr 0.000064 wd 0.0500 time 0.5814 (0.6227) data time 0.0008 (0.0042) model time 0.5805 (0.6185) loss 7.8557 (6.8676) grad_norm 3.0561 (2.8923) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 01:41:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][340/625] eta 0:02:56 lr 0.000064 wd 0.0500 time 0.5817 (0.6208) data time 0.0007 (0.0040) model time 0.5811 (0.6167) loss 6.1993 (6.8643) grad_norm 2.4946 (2.8649) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 01:41:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][350/625] eta 0:02:50 lr 0.000064 wd 0.0500 time 0.5811 (0.6190) data time 0.0007 (0.0039) model time 0.5804 (0.6151) loss 5.3842 (6.8697) grad_norm 1.9614 (inf) loss_scale 128.0000 (253.3770) mem 22344MB +[2024-07-29 01:42:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][360/625] eta 0:02:43 lr 0.000064 wd 0.0500 time 0.5761 (0.6173) data time 0.0007 (0.0038) model time 0.5754 (0.6136) loss 6.0829 (6.8628) grad_norm 4.0593 (inf) loss_scale 128.0000 (248.4409) mem 22344MB +[2024-07-29 01:42:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][370/625] eta 0:02:37 lr 0.000064 wd 0.0500 time 0.5825 (0.6158) data time 0.0007 (0.0037) model time 0.5818 (0.6121) loss 7.4483 (6.8572) grad_norm 2.4485 (inf) loss_scale 128.0000 (243.8788) mem 22344MB +[2024-07-29 01:42:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][380/625] eta 0:02:30 lr 0.000064 wd 0.0500 time 0.7960 (0.6153) data time 0.0009 (0.0036) model time 0.7951 (0.6118) loss 6.5769 (6.8391) grad_norm 2.2506 (inf) loss_scale 128.0000 (239.6496) mem 22344MB +[2024-07-29 01:42:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][390/625] eta 0:02:24 lr 0.000063 wd 0.0500 time 0.5844 (0.6140) data time 0.0007 (0.0035) model time 0.5838 (0.6105) loss 6.1270 (6.8420) grad_norm 2.5189 (inf) loss_scale 128.0000 (235.7183) mem 22344MB +[2024-07-29 01:42:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][400/625] eta 0:02:17 lr 0.000063 wd 0.0500 time 0.5772 (0.6127) data time 0.0007 (0.0034) model time 0.5765 (0.6094) loss 6.8312 (6.8337) grad_norm 4.4839 (inf) loss_scale 128.0000 (232.0544) mem 22344MB +[2024-07-29 01:42:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][410/625] eta 0:02:11 lr 0.000063 wd 0.0500 time 0.5941 (0.6116) data time 0.0009 (0.0033) model time 0.5932 (0.6083) loss 7.2741 (6.8256) grad_norm 1.9634 (inf) loss_scale 128.0000 (228.6316) mem 22344MB +[2024-07-29 01:42:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][420/625] eta 0:02:05 lr 0.000063 wd 0.0500 time 0.5807 (0.6106) data time 0.0009 (0.0032) model time 0.5798 (0.6073) loss 7.5492 (6.8314) grad_norm 2.2202 (inf) loss_scale 128.0000 (225.4268) mem 22344MB +[2024-07-29 01:42:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][430/625] eta 0:01:58 lr 0.000063 wd 0.0500 time 0.5859 (0.6096) data time 0.0006 (0.0032) model time 0.5853 (0.6064) loss 6.5428 (6.8491) grad_norm 2.0146 (inf) loss_scale 128.0000 (222.4198) mem 22344MB +[2024-07-29 01:42:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][440/625] eta 0:01:52 lr 0.000063 wd 0.0500 time 0.5795 (0.6085) data time 0.0006 (0.0031) model time 0.5789 (0.6054) loss 6.4988 (6.8504) grad_norm 2.5895 (inf) loss_scale 128.0000 (219.5928) mem 22344MB +[2024-07-29 01:42:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][450/625] eta 0:01:46 lr 0.000063 wd 0.0500 time 0.5776 (0.6075) data time 0.0008 (0.0031) model time 0.5768 (0.6045) loss 7.2992 (6.8546) grad_norm 3.0350 (inf) loss_scale 128.0000 (216.9302) mem 22344MB +[2024-07-29 01:43:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][460/625] eta 0:01:40 lr 0.000063 wd 0.0500 time 0.5845 (0.6067) data time 0.0008 (0.0030) model time 0.5837 (0.6037) loss 7.2151 (6.8564) grad_norm 4.4391 (inf) loss_scale 128.0000 (214.4181) mem 22344MB +[2024-07-29 01:43:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][470/625] eta 0:01:33 lr 0.000063 wd 0.0500 time 0.5906 (0.6059) data time 0.0007 (0.0029) model time 0.5900 (0.6030) loss 5.8100 (6.8465) grad_norm 4.6686 (inf) loss_scale 128.0000 (212.0440) mem 22344MB +[2024-07-29 01:43:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][480/625] eta 0:01:27 lr 0.000063 wd 0.0500 time 0.5823 (0.6052) data time 0.0010 (0.0029) model time 0.5812 (0.6023) loss 7.0483 (6.8422) grad_norm 1.8538 (inf) loss_scale 128.0000 (209.7968) mem 22344MB +[2024-07-29 01:43:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][490/625] eta 0:01:21 lr 0.000063 wd 0.0500 time 0.5792 (0.6045) data time 0.0008 (0.0028) model time 0.5784 (0.6016) loss 6.0016 (6.8315) grad_norm 6.7885 (inf) loss_scale 128.0000 (207.6667) mem 22344MB +[2024-07-29 01:43:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][500/625] eta 0:01:15 lr 0.000063 wd 0.0500 time 0.5933 (0.6038) data time 0.0010 (0.0028) model time 0.5923 (0.6010) loss 6.3373 (6.8296) grad_norm 3.2073 (inf) loss_scale 128.0000 (205.6447) mem 22344MB +[2024-07-29 01:43:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][510/625] eta 0:01:09 lr 0.000063 wd 0.0500 time 0.5797 (0.6031) data time 0.0007 (0.0027) model time 0.5790 (0.6004) loss 6.7708 (6.8319) grad_norm 1.9536 (inf) loss_scale 128.0000 (203.7228) mem 22344MB +[2024-07-29 01:43:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][520/625] eta 0:01:03 lr 0.000063 wd 0.0500 time 0.5929 (0.6025) data time 0.0009 (0.0027) model time 0.5920 (0.5998) loss 5.6798 (6.8311) grad_norm 3.8029 (inf) loss_scale 128.0000 (201.8937) mem 22344MB +[2024-07-29 01:43:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 01:43:36 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 01:43:42 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 01:47:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 01:47:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 01:57:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 01:57:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 02:20:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 02:20:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 02:20:43 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 02:20:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 02:20:55 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 02:20:55 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 02:20:55 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 02:20:56 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 262) +[2024-07-29 02:20:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 02:35:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 02:35:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 02:35:39 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 02:35:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 02:35:49 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 02:35:50 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 02:35:50 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 02:35:50 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 262) +[2024-07-29 02:35:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 02:36:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][530/625] eta 0:02:44 lr 0.000063 wd 0.0500 time 0.5740 (1.7266) data time 0.0009 (0.0678) model time 0.5731 (1.6588) loss 7.6478 (7.1823) grad_norm 2.5391 (2.6143) loss_scale 128.0000 (128.0000) mem 22343MB +[2024-07-29 02:36:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][540/625] eta 0:01:37 lr 0.000063 wd 0.0500 time 0.5753 (1.1498) data time 0.0007 (0.0344) model time 0.5745 (1.1154) loss 7.1553 (7.0678) grad_norm 2.2047 (2.6932) loss_scale 128.0000 (128.0000) mem 22343MB +[2024-07-29 02:36:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][550/625] eta 0:01:11 lr 0.000063 wd 0.0500 time 0.5769 (0.9574) data time 0.0009 (0.0233) model time 0.5760 (0.9341) loss 7.4026 (7.0976) grad_norm 1.9155 (2.7340) loss_scale 128.0000 (128.0000) mem 22343MB +[2024-07-29 02:36:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][560/625] eta 0:00:55 lr 0.000063 wd 0.0500 time 0.5728 (0.8610) data time 0.0007 (0.0177) model time 0.5721 (0.8432) loss 5.9857 (6.9442) grad_norm 2.5787 (2.7168) loss_scale 128.0000 (128.0000) mem 22343MB +[2024-07-29 02:36:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][570/625] eta 0:00:44 lr 0.000063 wd 0.0500 time 0.5746 (0.8034) data time 0.0008 (0.0144) model time 0.5738 (0.7890) loss 6.1487 (6.9067) grad_norm 3.0074 (2.8051) loss_scale 128.0000 (128.0000) mem 22343MB +[2024-07-29 02:36:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][580/625] eta 0:00:34 lr 0.000063 wd 0.0500 time 0.5777 (0.7712) data time 0.0007 (0.0122) model time 0.5770 (0.7590) loss 7.3321 (6.8519) grad_norm 3.5657 (2.8150) loss_scale 128.0000 (128.0000) mem 22343MB +[2024-07-29 02:36:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][590/625] eta 0:00:26 lr 0.000063 wd 0.0500 time 0.5792 (0.7434) data time 0.0007 (0.0106) model time 0.5785 (0.7328) loss 6.0808 (6.8284) grad_norm 2.6185 (2.7778) loss_scale 128.0000 (128.0000) mem 22343MB +[2024-07-29 02:36:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][600/625] eta 0:00:18 lr 0.000063 wd 0.0500 time 0.5777 (0.7225) data time 0.0009 (0.0094) model time 0.5768 (0.7132) loss 7.0042 (6.8663) grad_norm 3.1319 (2.7278) loss_scale 128.0000 (128.0000) mem 22343MB +[2024-07-29 02:36:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][610/625] eta 0:00:10 lr 0.000063 wd 0.0500 time 0.5764 (0.7063) data time 0.0004 (0.0085) model time 0.5760 (0.6979) loss 7.4750 (6.8677) grad_norm 1.9573 (2.6813) loss_scale 128.0000 (128.0000) mem 22343MB +[2024-07-29 02:37:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [262/300][620/625] eta 0:00:03 lr 0.000062 wd 0.0500 time 0.5761 (0.6934) data time 0.0006 (0.0077) model time 0.5755 (0.6857) loss 8.1871 (6.8808) grad_norm 2.0624 (2.6453) loss_scale 128.0000 (128.0000) mem 22343MB +[2024-07-29 02:37:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 262 training takes 0:01:11 +[2024-07-29 02:37:06 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 02:37:11 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 02:37:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.465 (0.465) Loss 0.4995 (0.4995) Acc@1 90.186 (90.186) Acc@5 98.926 (98.926) Mem 22343MB +[2024-07-29 02:37:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.156) Loss 0.7412 (0.6023) Acc@1 83.252 (88.081) Acc@5 97.021 (98.158) Mem 22343MB +[2024-07-29 02:37:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.141) Loss 0.8164 (0.6872) Acc@1 81.201 (85.514) Acc@5 96.533 (97.377) Mem 22343MB +[2024-07-29 02:37:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.151 Acc@5 97.373 +[2024-07-29 02:37:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.2% +[2024-07-29 02:37:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.836 (0.836) Loss 0.4985 (0.4985) Acc@1 90.381 (90.381) Acc@5 98.926 (98.926) Mem 22343MB +[2024-07-29 02:37:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.124 (0.190) Loss 0.7383 (0.6047) Acc@1 83.203 (88.170) Acc@5 97.168 (98.180) Mem 22343MB +[2024-07-29 02:37:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.124 (0.162) Loss 0.8281 (0.6888) Acc@1 80.811 (85.507) Acc@5 96.240 (97.373) Mem 22343MB +[2024-07-29 02:37:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.121 Acc@5 97.377 +[2024-07-29 02:37:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.1% +[2024-07-29 02:37:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.12% +[2024-07-29 02:37:20 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 02:37:22 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 02:37:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][0/625] eta 0:10:31 lr 0.000062 wd 0.0500 time 1.0109 (1.0109) data time 0.3614 (0.3614) model time 0.0000 (0.0000) loss 6.0952 (6.0952) grad_norm 2.2056 (2.2056) loss_scale 128.0000 (128.0000) mem 22337MB +[2024-07-29 02:37:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][10/625] eta 0:06:21 lr 0.000062 wd 0.0500 time 0.5761 (0.6207) data time 0.0007 (0.0350) model time 0.0000 (0.0000) loss 6.0172 (6.8149) grad_norm 3.0183 (2.3616) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:37:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][20/625] eta 0:06:03 lr 0.000062 wd 0.0500 time 0.5762 (0.6001) data time 0.0007 (0.0188) model time 0.0000 (0.0000) loss 7.1159 (6.8788) grad_norm 2.9269 (2.5007) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:37:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][30/625] eta 0:05:52 lr 0.000062 wd 0.0500 time 0.5745 (0.5927) data time 0.0008 (0.0131) model time 0.0000 (0.0000) loss 5.9335 (6.8817) grad_norm 2.1660 (2.6387) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:37:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][40/625] eta 0:05:44 lr 0.000062 wd 0.0500 time 0.5780 (0.5885) data time 0.0008 (0.0101) model time 0.0000 (0.0000) loss 7.0255 (6.8046) grad_norm 2.4372 (2.5462) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:37:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][50/625] eta 0:05:39 lr 0.000062 wd 0.0500 time 0.5694 (0.5909) data time 0.0008 (0.0083) model time 0.0000 (0.0000) loss 7.5212 (6.8225) grad_norm 2.8505 (2.5283) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:37:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][60/625] eta 0:05:34 lr 0.000062 wd 0.0500 time 0.5612 (0.5914) data time 0.0009 (0.0071) model time 0.5603 (0.5929) loss 8.3937 (6.8300) grad_norm 2.5857 (2.5214) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:38:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][70/625] eta 0:05:26 lr 0.000062 wd 0.0500 time 0.5722 (0.5890) data time 0.0006 (0.0062) model time 0.5716 (0.5833) loss 6.3899 (6.8361) grad_norm 3.1047 (2.9792) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:38:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][80/625] eta 0:05:21 lr 0.000062 wd 0.0500 time 0.5734 (0.5897) data time 0.0009 (0.0056) model time 0.5724 (0.5866) loss 6.3181 (6.8192) grad_norm 2.6266 (2.9358) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:38:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][90/625] eta 0:05:15 lr 0.000062 wd 0.0500 time 0.5640 (0.5889) data time 0.0008 (0.0051) model time 0.5632 (0.5854) loss 6.0618 (6.7821) grad_norm 2.7033 (2.9075) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:38:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][100/625] eta 0:05:08 lr 0.000062 wd 0.0500 time 0.5748 (0.5881) data time 0.0008 (0.0047) model time 0.5740 (0.5843) loss 6.7108 (6.7511) grad_norm 1.8129 (2.8307) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:38:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][110/625] eta 0:05:02 lr 0.000062 wd 0.0500 time 0.5750 (0.5880) data time 0.0008 (0.0043) model time 0.5742 (0.5846) loss 7.0339 (6.7469) grad_norm 31.9674 (3.0512) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:38:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][120/625] eta 0:04:56 lr 0.000062 wd 0.0500 time 0.5655 (0.5879) data time 0.0007 (0.0040) model time 0.5648 (0.5848) loss 6.5857 (6.7790) grad_norm 2.5397 (3.0249) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:38:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][130/625] eta 0:04:51 lr 0.000062 wd 0.0500 time 0.5669 (0.5881) data time 0.0008 (0.0041) model time 0.5661 (0.5850) loss 7.5167 (6.7925) grad_norm 3.7404 (2.9870) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:38:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][140/625] eta 0:04:45 lr 0.000062 wd 0.0500 time 0.5649 (0.5885) data time 0.0010 (0.0039) model time 0.5639 (0.5857) loss 6.1585 (6.8081) grad_norm 2.0466 (2.9336) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:38:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][150/625] eta 0:04:39 lr 0.000062 wd 0.0500 time 0.5749 (0.5884) data time 0.0007 (0.0038) model time 0.5742 (0.5857) loss 5.7589 (6.7866) grad_norm 1.9347 (2.8931) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:38:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][160/625] eta 0:04:33 lr 0.000062 wd 0.0500 time 0.5692 (0.5877) data time 0.0006 (0.0036) model time 0.5686 (0.5848) loss 6.3234 (6.7715) grad_norm 3.4185 (2.8987) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:39:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][170/625] eta 0:04:27 lr 0.000062 wd 0.0500 time 0.5727 (0.5871) data time 0.0009 (0.0035) model time 0.5719 (0.5840) loss 7.6535 (6.7801) grad_norm 3.0310 (2.9255) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:39:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][180/625] eta 0:04:21 lr 0.000062 wd 0.0500 time 0.5748 (0.5873) data time 0.0008 (0.0033) model time 0.5740 (0.5845) loss 7.2018 (6.7967) grad_norm 1.7815 (2.8837) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:39:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][190/625] eta 0:04:15 lr 0.000062 wd 0.0500 time 0.5753 (0.5867) data time 0.0008 (0.0032) model time 0.5745 (0.5838) loss 7.1022 (6.7889) grad_norm 2.0529 (2.8681) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:39:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][200/625] eta 0:04:09 lr 0.000062 wd 0.0500 time 0.5734 (0.5871) data time 0.0008 (0.0035) model time 0.5726 (0.5841) loss 5.9362 (6.7746) grad_norm 3.1636 (2.8563) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:39:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][210/625] eta 0:04:03 lr 0.000062 wd 0.0500 time 0.5658 (0.5874) data time 0.0008 (0.0033) model time 0.5650 (0.5845) loss 7.8174 (6.7819) grad_norm 2.5447 (2.8556) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:39:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][220/625] eta 0:03:57 lr 0.000062 wd 0.0500 time 0.5705 (0.5869) data time 0.0006 (0.0032) model time 0.5699 (0.5840) loss 7.2178 (6.7963) grad_norm 2.0076 (2.8588) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:39:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][230/625] eta 0:03:51 lr 0.000061 wd 0.0500 time 0.5683 (0.5867) data time 0.0006 (0.0031) model time 0.5677 (0.5838) loss 6.2463 (6.7834) grad_norm 2.2536 (2.9109) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:39:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][240/625] eta 0:03:45 lr 0.000061 wd 0.0500 time 0.5720 (0.5869) data time 0.0007 (0.0030) model time 0.5713 (0.5843) loss 6.3910 (6.7974) grad_norm 2.3483 (2.8991) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:39:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][250/625] eta 0:03:39 lr 0.000061 wd 0.0500 time 0.5738 (0.5866) data time 0.0009 (0.0030) model time 0.5730 (0.5839) loss 6.8755 (6.7998) grad_norm 1.7584 (2.8723) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:39:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][260/625] eta 0:03:34 lr 0.000061 wd 0.0500 time 0.5706 (0.5865) data time 0.0008 (0.0029) model time 0.5698 (0.5839) loss 7.1479 (6.7931) grad_norm 2.5562 (2.8414) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:40:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][270/625] eta 0:03:28 lr 0.000061 wd 0.0500 time 0.5731 (0.5866) data time 0.0006 (0.0028) model time 0.5725 (0.5841) loss 6.0078 (6.7998) grad_norm 2.5698 (2.8309) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:40:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][280/625] eta 0:03:22 lr 0.000061 wd 0.0500 time 0.5653 (0.5865) data time 0.0006 (0.0028) model time 0.5646 (0.5839) loss 5.4822 (6.7894) grad_norm 2.7520 (2.8126) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:40:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][290/625] eta 0:03:16 lr 0.000061 wd 0.0500 time 0.5670 (0.5866) data time 0.0008 (0.0027) model time 0.5662 (0.5842) loss 6.5312 (6.7838) grad_norm 2.3749 (2.8015) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:40:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][300/625] eta 0:03:10 lr 0.000061 wd 0.0500 time 0.5655 (0.5865) data time 0.0007 (0.0026) model time 0.5648 (0.5842) loss 6.1694 (6.7783) grad_norm 2.6437 (2.8010) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:40:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][310/625] eta 0:03:04 lr 0.000061 wd 0.0500 time 0.5727 (0.5863) data time 0.0009 (0.0026) model time 0.5718 (0.5839) loss 6.8438 (6.7818) grad_norm 1.7955 (2.7853) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:40:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][320/625] eta 0:02:58 lr 0.000061 wd 0.0500 time 0.5739 (0.5859) data time 0.0006 (0.0025) model time 0.5732 (0.5835) loss 7.2541 (6.7846) grad_norm 2.5398 (2.7762) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:40:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][330/625] eta 0:02:52 lr 0.000061 wd 0.0500 time 0.5719 (0.5858) data time 0.0006 (0.0025) model time 0.5712 (0.5834) loss 6.6120 (6.7895) grad_norm 3.2324 (2.7671) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:40:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][340/625] eta 0:02:46 lr 0.000061 wd 0.0500 time 0.5731 (0.5855) data time 0.0009 (0.0025) model time 0.5722 (0.5831) loss 7.2050 (6.7901) grad_norm 1.7376 (2.7598) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:40:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][350/625] eta 0:02:40 lr 0.000061 wd 0.0500 time 0.5723 (0.5852) data time 0.0007 (0.0024) model time 0.5717 (0.5828) loss 7.8653 (6.7872) grad_norm 2.2755 (2.7555) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:40:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][360/625] eta 0:02:35 lr 0.000061 wd 0.0500 time 0.5762 (0.5849) data time 0.0007 (0.0024) model time 0.5756 (0.5826) loss 6.5691 (6.7772) grad_norm 3.9182 (2.7689) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:40:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][370/625] eta 0:02:29 lr 0.000061 wd 0.0500 time 0.5771 (0.5847) data time 0.0007 (0.0023) model time 0.5764 (0.5824) loss 6.3121 (6.7705) grad_norm 3.1075 (2.7708) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:41:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][380/625] eta 0:02:23 lr 0.000061 wd 0.0500 time 0.5784 (0.5846) data time 0.0006 (0.0023) model time 0.5778 (0.5822) loss 7.4717 (6.7755) grad_norm 2.1126 (2.7646) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:41:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][390/625] eta 0:02:17 lr 0.000061 wd 0.0500 time 0.5696 (0.5844) data time 0.0007 (0.0023) model time 0.5689 (0.5820) loss 6.8079 (6.7791) grad_norm 6.3388 (2.7720) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:41:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][400/625] eta 0:02:11 lr 0.000061 wd 0.0500 time 0.5680 (0.5846) data time 0.0009 (0.0022) model time 0.5671 (0.5823) loss 7.3743 (6.7783) grad_norm 2.2315 (2.7702) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:41:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][410/625] eta 0:02:05 lr 0.000061 wd 0.0500 time 0.5720 (0.5843) data time 0.0008 (0.0022) model time 0.5711 (0.5820) loss 5.9889 (6.7914) grad_norm 2.1513 (2.8090) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:41:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][420/625] eta 0:01:59 lr 0.000061 wd 0.0500 time 0.5747 (0.5841) data time 0.0006 (0.0022) model time 0.5740 (0.5819) loss 7.3998 (6.7866) grad_norm 3.6491 (2.8045) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:41:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][430/625] eta 0:01:53 lr 0.000061 wd 0.0500 time 0.5707 (0.5839) data time 0.0007 (0.0022) model time 0.5701 (0.5817) loss 6.3113 (6.7810) grad_norm 2.2007 (2.8000) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:41:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][440/625] eta 0:01:47 lr 0.000061 wd 0.0500 time 0.5728 (0.5837) data time 0.0006 (0.0021) model time 0.5722 (0.5815) loss 6.5581 (6.7864) grad_norm 4.6280 (2.8173) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:41:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][450/625] eta 0:01:42 lr 0.000061 wd 0.0500 time 0.5680 (0.5836) data time 0.0008 (0.0021) model time 0.5672 (0.5813) loss 6.8034 (6.7896) grad_norm 2.2689 (2.8143) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:41:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][460/625] eta 0:01:36 lr 0.000060 wd 0.0500 time 0.5725 (0.5834) data time 0.0009 (0.0021) model time 0.5716 (0.5812) loss 6.8722 (6.7924) grad_norm 2.2439 (2.8117) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:41:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][470/625] eta 0:01:30 lr 0.000060 wd 0.0500 time 0.5695 (0.5832) data time 0.0008 (0.0020) model time 0.5687 (0.5810) loss 7.4823 (6.7972) grad_norm 3.2267 (2.8059) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:42:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][480/625] eta 0:01:24 lr 0.000060 wd 0.0500 time 0.5677 (0.5830) data time 0.0009 (0.0020) model time 0.5668 (0.5808) loss 7.0883 (6.7993) grad_norm 2.6798 (2.7974) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:42:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][490/625] eta 0:01:18 lr 0.000060 wd 0.0500 time 0.5676 (0.5828) data time 0.0006 (0.0020) model time 0.5670 (0.5806) loss 7.2100 (6.8039) grad_norm 2.3145 (2.7916) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:42:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][500/625] eta 0:01:12 lr 0.000060 wd 0.0500 time 0.5759 (0.5827) data time 0.0007 (0.0020) model time 0.5753 (0.5805) loss 7.9030 (6.8010) grad_norm 1.7396 (2.7813) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:42:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][510/625] eta 0:01:06 lr 0.000060 wd 0.0500 time 0.5724 (0.5826) data time 0.0006 (0.0020) model time 0.5718 (0.5804) loss 7.3100 (6.8019) grad_norm 1.9491 (2.7700) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:42:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][520/625] eta 0:01:01 lr 0.000060 wd 0.0500 time 0.5738 (0.5825) data time 0.0009 (0.0019) model time 0.5729 (0.5803) loss 7.5021 (6.8046) grad_norm 2.8151 (2.7881) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:42:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][530/625] eta 0:00:55 lr 0.000060 wd 0.0500 time 0.5779 (0.5824) data time 0.0008 (0.0019) model time 0.5771 (0.5803) loss 5.7519 (6.8030) grad_norm 1.9645 (2.8004) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:42:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][540/625] eta 0:00:49 lr 0.000060 wd 0.0500 time 0.5757 (0.5823) data time 0.0006 (0.0019) model time 0.5751 (0.5802) loss 5.6053 (6.7965) grad_norm 2.3571 (2.7995) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:42:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][550/625] eta 0:00:43 lr 0.000060 wd 0.0500 time 0.5767 (0.5825) data time 0.0006 (0.0019) model time 0.5761 (0.5803) loss 6.8701 (6.7966) grad_norm 3.0358 (2.7908) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:42:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][560/625] eta 0:00:37 lr 0.000060 wd 0.0500 time 0.5735 (0.5823) data time 0.0007 (0.0019) model time 0.5729 (0.5802) loss 5.9805 (6.7887) grad_norm 2.4256 (2.7908) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:42:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][570/625] eta 0:00:32 lr 0.000060 wd 0.0500 time 0.5765 (0.5822) data time 0.0006 (0.0019) model time 0.5759 (0.5801) loss 6.6272 (6.7895) grad_norm 1.4437 (2.7809) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:43:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][580/625] eta 0:00:26 lr 0.000060 wd 0.0500 time 0.5731 (0.5821) data time 0.0008 (0.0019) model time 0.5722 (0.5801) loss 7.4495 (6.7870) grad_norm 1.8050 (2.7727) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:43:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][590/625] eta 0:00:20 lr 0.000060 wd 0.0500 time 0.5676 (0.5821) data time 0.0008 (0.0018) model time 0.5667 (0.5800) loss 6.2544 (6.7870) grad_norm 4.0494 (2.7717) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:43:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][600/625] eta 0:00:14 lr 0.000060 wd 0.0500 time 0.5761 (0.5820) data time 0.0006 (0.0018) model time 0.5754 (0.5799) loss 7.1836 (6.7856) grad_norm 1.9889 (2.7676) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:43:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][610/625] eta 0:00:08 lr 0.000060 wd 0.0500 time 0.7607 (0.5822) data time 0.0006 (0.0018) model time 0.7601 (0.5802) loss 8.2135 (6.7833) grad_norm 1.9391 (2.7721) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:43:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [263/300][620/625] eta 0:00:02 lr 0.000060 wd 0.0500 time 0.5733 (0.5821) data time 0.0004 (0.0018) model time 0.5729 (0.5801) loss 6.8055 (6.7818) grad_norm 2.6693 (2.7695) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:43:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 263 training takes 0:06:03 +[2024-07-29 02:43:26 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 02:43:29 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 02:43:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.470 (0.470) Loss 0.4902 (0.4902) Acc@1 90.234 (90.234) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-29 02:43:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.124 (0.157) Loss 0.7314 (0.5947) Acc@1 83.398 (88.246) Acc@5 97.168 (98.153) Mem 22339MB +[2024-07-29 02:43:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8169 (0.6806) Acc@1 80.762 (85.561) Acc@5 96.387 (97.356) Mem 22339MB +[2024-07-29 02:43:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.187 Acc@5 97.355 +[2024-07-29 02:43:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.2% +[2024-07-29 02:43:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.815 (0.815) Loss 0.4985 (0.4985) Acc@1 90.332 (90.332) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-29 02:43:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.189) Loss 0.7363 (0.6041) Acc@1 83.252 (88.201) Acc@5 97.217 (98.184) Mem 22339MB +[2024-07-29 02:43:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.159) Loss 0.8271 (0.6882) Acc@1 80.811 (85.528) Acc@5 96.240 (97.382) Mem 22339MB +[2024-07-29 02:43:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.133 Acc@5 97.383 +[2024-07-29 02:43:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.1% +[2024-07-29 02:43:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.13% +[2024-07-29 02:43:36 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 02:43:41 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 02:43:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][0/625] eta 0:09:24 lr 0.000060 wd 0.0500 time 0.9037 (0.9037) data time 0.3814 (0.3814) model time 0.0000 (0.0000) loss 7.6680 (7.6680) grad_norm 1.7792 (1.7792) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:43:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][10/625] eta 0:06:11 lr 0.000060 wd 0.0500 time 0.5740 (0.6046) data time 0.0008 (0.0355) model time 0.0000 (0.0000) loss 7.2526 (7.3163) grad_norm 2.1135 (2.7281) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:43:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][20/625] eta 0:05:57 lr 0.000060 wd 0.0500 time 0.5741 (0.5910) data time 0.0008 (0.0190) model time 0.0000 (0.0000) loss 6.2271 (7.0793) grad_norm 2.1286 (2.6250) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 02:43:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 02:43:57 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 02:44:00 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 02:48:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 02:48:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 02:49:06 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 02:49:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 02:49:19 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 02:49:20 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 02:49:20 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 02:49:20 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 264) +[2024-07-29 02:49:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 02:49:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][30/625] eta 0:58:27 lr 0.000060 wd 0.0500 time 0.5173 (5.8953) data time 0.0007 (0.2460) model time 0.0000 (0.0000) loss 6.5266 (7.0101) grad_norm 2.3797 (2.3208) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 02:49:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][40/625] eta 0:17:12 lr 0.000060 wd 0.0500 time 0.5162 (1.7647) data time 0.0009 (0.0575) model time 0.0000 (0.0000) loss 7.0953 (6.9221) grad_norm 1.8083 (2.5099) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 02:49:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][50/625] eta 0:11:45 lr 0.000060 wd 0.0500 time 0.5178 (1.2263) data time 0.0007 (0.0329) model time 0.0000 (0.0000) loss 7.2286 (7.0295) grad_norm 2.1658 (2.3290) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 02:49:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][60/625] eta 0:09:32 lr 0.000060 wd 0.0500 time 0.5175 (1.0141) data time 0.0008 (0.0232) model time 0.5166 (0.5253) loss 6.2963 (7.0779) grad_norm 3.1110 (2.3371) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 02:50:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 02:50:02 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 02:50:10 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 04:55:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 04:55:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 04:56:14 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 04:56:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 04:56:28 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 04:56:29 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 04:56:29 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 04:56:29 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 264) +[2024-07-29 04:56:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 04:56:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][70/625] eta 1:02:00 lr 0.000060 wd 0.0500 time 1.8736 (6.7041) data time 0.0010 (0.4676) model time 1.8726 (6.2366) loss 7.6896 (7.2811) grad_norm 2.7441 (2.4215) loss_scale 128.0000 (128.0000) mem 22342MB +[2024-07-29 04:56:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][80/625] eta 0:14:41 lr 0.000059 wd 0.0500 time 0.6194 (1.6171) data time 0.0008 (0.0789) model time 0.6186 (1.5382) loss 6.1763 (7.1435) grad_norm 2.0187 (2.5378) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:56:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][90/625] eta 0:10:16 lr 0.000059 wd 0.0500 time 0.6096 (1.1515) data time 0.0011 (0.0436) model time 0.6085 (1.1079) loss 7.1799 (7.1583) grad_norm 2.0746 (2.3785) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:57:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][100/625] eta 0:08:35 lr 0.000059 wd 0.0500 time 0.5905 (0.9821) data time 0.0008 (0.0303) model time 0.5896 (0.9518) loss 6.9125 (7.1341) grad_norm 2.4814 (2.5455) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:57:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][110/625] eta 0:07:37 lr 0.000059 wd 0.0500 time 0.5893 (0.8881) data time 0.0011 (0.0234) model time 0.5883 (0.8646) loss 7.6340 (7.1208) grad_norm 2.0781 (2.8992) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:57:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][120/625] eta 0:07:02 lr 0.000059 wd 0.0500 time 0.6185 (0.8359) data time 0.0008 (0.0192) model time 0.6177 (0.8167) loss 6.6471 (7.0921) grad_norm 2.4443 (3.0633) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:57:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][130/625] eta 0:06:36 lr 0.000059 wd 0.0500 time 0.5966 (0.8006) data time 0.0009 (0.0163) model time 0.5958 (0.7843) loss 8.0428 (7.0645) grad_norm 2.0065 (2.9986) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:57:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][140/625] eta 0:06:15 lr 0.000059 wd 0.0500 time 0.7017 (0.7750) data time 0.0010 (0.0141) model time 0.7007 (0.7608) loss 6.9094 (6.9891) grad_norm 1.9893 (2.9701) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:57:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][150/625] eta 0:05:58 lr 0.000059 wd 0.0500 time 0.6174 (0.7539) data time 0.0010 (0.0126) model time 0.6164 (0.7413) loss 8.4601 (6.9898) grad_norm 1.6697 (2.9444) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:57:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][160/625] eta 0:05:42 lr 0.000059 wd 0.0500 time 0.5993 (0.7370) data time 0.0008 (0.0114) model time 0.5985 (0.7256) loss 5.6769 (6.9454) grad_norm 1.8672 (2.9073) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:57:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][170/625] eta 0:05:28 lr 0.000059 wd 0.0500 time 0.5951 (0.7230) data time 0.0008 (0.0104) model time 0.5943 (0.7126) loss 8.1384 (6.9634) grad_norm 2.1646 (2.8669) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:57:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][180/625] eta 0:05:16 lr 0.000059 wd 0.0500 time 0.5886 (0.7114) data time 0.0011 (0.0095) model time 0.5874 (0.7018) loss 7.7902 (6.9292) grad_norm 2.4887 (2.8484) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:57:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][190/625] eta 0:05:05 lr 0.000059 wd 0.0500 time 0.5890 (0.7015) data time 0.0009 (0.0089) model time 0.5881 (0.6927) loss 6.4619 (6.9093) grad_norm 2.4524 (2.8111) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:58:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][200/625] eta 0:04:54 lr 0.000059 wd 0.0500 time 0.5984 (0.6934) data time 0.0011 (0.0083) model time 0.5974 (0.6852) loss 6.9972 (6.9165) grad_norm 2.8688 (2.7886) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:58:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][210/625] eta 0:04:45 lr 0.000059 wd 0.0500 time 0.5959 (0.6868) data time 0.0011 (0.0078) model time 0.5948 (0.6790) loss 6.6924 (6.8934) grad_norm 1.8082 (2.7621) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:58:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][220/625] eta 0:04:35 lr 0.000059 wd 0.0500 time 0.5983 (0.6810) data time 0.0011 (0.0073) model time 0.5972 (0.6737) loss 7.1062 (6.8567) grad_norm 2.6604 (2.7264) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:58:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][230/625] eta 0:04:27 lr 0.000059 wd 0.0500 time 0.6013 (0.6760) data time 0.0011 (0.0069) model time 0.6002 (0.6691) loss 7.4101 (6.8602) grad_norm 2.5921 (2.7278) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:58:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][240/625] eta 0:04:18 lr 0.000059 wd 0.0500 time 0.5958 (0.6714) data time 0.0010 (0.0066) model time 0.5948 (0.6648) loss 6.9252 (6.8538) grad_norm 2.2767 (2.7127) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:58:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][250/625] eta 0:04:10 lr 0.000059 wd 0.0500 time 0.5942 (0.6672) data time 0.0010 (0.0063) model time 0.5932 (0.6609) loss 7.8628 (6.8567) grad_norm 3.0222 (2.7033) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:58:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][260/625] eta 0:04:02 lr 0.000059 wd 0.0500 time 0.5975 (0.6636) data time 0.0011 (0.0061) model time 0.5964 (0.6575) loss 7.3671 (6.8506) grad_norm 2.1199 (2.7077) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:58:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][270/625] eta 0:03:54 lr 0.000059 wd 0.0500 time 0.5975 (0.6602) data time 0.0008 (0.0059) model time 0.5967 (0.6543) loss 7.3123 (6.8426) grad_norm 2.7686 (2.6911) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:58:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][280/625] eta 0:03:46 lr 0.000059 wd 0.0500 time 0.6025 (0.6572) data time 0.0010 (0.0056) model time 0.6015 (0.6516) loss 6.6554 (6.8269) grad_norm 3.0877 (2.6760) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:58:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][290/625] eta 0:03:39 lr 0.000059 wd 0.0500 time 0.6086 (0.6547) data time 0.0009 (0.0054) model time 0.6077 (0.6493) loss 7.5056 (6.8156) grad_norm 1.8031 (2.6590) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:59:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][300/625] eta 0:03:32 lr 0.000059 wd 0.0500 time 0.6014 (0.6524) data time 0.0008 (0.0053) model time 0.6006 (0.6472) loss 6.1328 (6.8219) grad_norm 2.5234 (2.6456) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:59:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][310/625] eta 0:03:24 lr 0.000059 wd 0.0500 time 0.5952 (0.6503) data time 0.0008 (0.0051) model time 0.5944 (0.6452) loss 6.9823 (6.8114) grad_norm 1.9730 (2.6351) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:59:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][320/625] eta 0:03:17 lr 0.000058 wd 0.0500 time 0.6000 (0.6481) data time 0.0009 (0.0049) model time 0.5991 (0.6431) loss 7.4699 (6.7993) grad_norm 2.0816 (2.6199) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:59:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][330/625] eta 0:03:10 lr 0.000058 wd 0.0500 time 0.5954 (0.6460) data time 0.0011 (0.0048) model time 0.5943 (0.6412) loss 7.1347 (6.7900) grad_norm 2.8331 (2.6375) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:59:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][340/625] eta 0:03:03 lr 0.000058 wd 0.0500 time 0.7306 (0.6446) data time 0.0010 (0.0046) model time 0.7297 (0.6400) loss 6.1251 (6.7788) grad_norm 1.8732 (2.6214) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:59:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][350/625] eta 0:02:57 lr 0.000058 wd 0.0500 time 0.6041 (0.6436) data time 0.0010 (0.0045) model time 0.6031 (0.6391) loss 5.7054 (6.7906) grad_norm 2.4426 (2.5982) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:59:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][360/625] eta 0:02:50 lr 0.000058 wd 0.0500 time 0.6011 (0.6422) data time 0.0010 (0.0044) model time 0.6001 (0.6378) loss 6.6498 (6.7955) grad_norm 2.5278 (2.5942) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:59:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][370/625] eta 0:02:43 lr 0.000058 wd 0.0500 time 0.6006 (0.6409) data time 0.0010 (0.0043) model time 0.5996 (0.6366) loss 7.5627 (6.7839) grad_norm 3.0636 (2.5919) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:59:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][380/625] eta 0:02:36 lr 0.000058 wd 0.0500 time 0.5978 (0.6395) data time 0.0010 (0.0042) model time 0.5968 (0.6353) loss 7.4854 (6.7843) grad_norm 1.8214 (2.5953) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 04:59:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][390/625] eta 0:02:29 lr 0.000058 wd 0.0500 time 0.5922 (0.6381) data time 0.0010 (0.0041) model time 0.5912 (0.6340) loss 7.4161 (6.8010) grad_norm 3.9977 (2.5959) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:00:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][400/625] eta 0:02:23 lr 0.000058 wd 0.0500 time 0.5917 (0.6367) data time 0.0010 (0.0040) model time 0.5907 (0.6327) loss 7.6197 (6.8008) grad_norm 2.2952 (2.5878) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:00:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][410/625] eta 0:02:16 lr 0.000058 wd 0.0500 time 0.5956 (0.6354) data time 0.0009 (0.0039) model time 0.5947 (0.6315) loss 7.5783 (6.8008) grad_norm 2.0419 (2.6149) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:00:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][420/625] eta 0:02:10 lr 0.000058 wd 0.0500 time 0.5959 (0.6342) data time 0.0008 (0.0038) model time 0.5951 (0.6303) loss 8.3022 (6.8046) grad_norm 2.3774 (2.6188) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:00:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][430/625] eta 0:02:03 lr 0.000058 wd 0.0500 time 0.5990 (0.6332) data time 0.0008 (0.0038) model time 0.5982 (0.6294) loss 7.3108 (6.8018) grad_norm 2.5071 (2.6221) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:00:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][440/625] eta 0:01:56 lr 0.000058 wd 0.0500 time 0.6003 (0.6323) data time 0.0010 (0.0037) model time 0.5993 (0.6286) loss 7.3325 (6.7989) grad_norm 2.1439 (2.6302) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:00:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][450/625] eta 0:01:50 lr 0.000058 wd 0.0500 time 0.6002 (0.6314) data time 0.0008 (0.0036) model time 0.5993 (0.6278) loss 6.9875 (6.7977) grad_norm 2.2392 (2.6183) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:00:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][460/625] eta 0:01:44 lr 0.000058 wd 0.0500 time 0.5911 (0.6305) data time 0.0007 (0.0036) model time 0.5904 (0.6269) loss 6.2712 (6.7906) grad_norm 3.3357 (2.6155) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:00:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][470/625] eta 0:01:37 lr 0.000058 wd 0.0500 time 0.5936 (0.6296) data time 0.0008 (0.0035) model time 0.5928 (0.6261) loss 7.4462 (6.7978) grad_norm 2.3852 (2.6162) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:00:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][480/625] eta 0:01:31 lr 0.000058 wd 0.0500 time 0.5938 (0.6287) data time 0.0011 (0.0034) model time 0.5927 (0.6253) loss 6.4460 (6.7993) grad_norm 2.0178 (2.6071) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:00:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][490/625] eta 0:01:24 lr 0.000058 wd 0.0500 time 0.5917 (0.6280) data time 0.0008 (0.0034) model time 0.5910 (0.6247) loss 6.7899 (6.7980) grad_norm 1.7870 (2.5951) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:01:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][500/625] eta 0:01:18 lr 0.000058 wd 0.0500 time 0.6001 (0.6274) data time 0.0008 (0.0033) model time 0.5993 (0.6240) loss 5.8569 (6.7996) grad_norm 2.4175 (2.6117) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:01:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][510/625] eta 0:01:12 lr 0.000058 wd 0.0500 time 0.6100 (0.6268) data time 0.0008 (0.0033) model time 0.6092 (0.6235) loss 6.7505 (6.8033) grad_norm 2.1599 (2.6614) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:01:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][520/625] eta 0:01:05 lr 0.000058 wd 0.0500 time 0.5982 (0.6262) data time 0.0009 (0.0033) model time 0.5972 (0.6230) loss 5.9313 (6.7999) grad_norm 4.3395 (2.6639) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:01:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][530/625] eta 0:00:59 lr 0.000058 wd 0.0500 time 0.5998 (0.6256) data time 0.0008 (0.0032) model time 0.5990 (0.6224) loss 6.0462 (6.7977) grad_norm 2.1297 (2.6596) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:01:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][540/625] eta 0:00:53 lr 0.000058 wd 0.0500 time 0.5936 (0.6250) data time 0.0010 (0.0032) model time 0.5926 (0.6218) loss 7.3862 (6.7907) grad_norm 2.6216 (2.6758) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:01:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][550/625] eta 0:00:46 lr 0.000058 wd 0.0500 time 0.5966 (0.6244) data time 0.0008 (0.0031) model time 0.5958 (0.6213) loss 7.3659 (6.7859) grad_norm 2.4488 (2.6709) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:01:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][560/625] eta 0:00:40 lr 0.000057 wd 0.0500 time 0.5936 (0.6243) data time 0.0010 (0.0031) model time 0.5926 (0.6212) loss 7.9516 (6.7927) grad_norm 2.9456 (2.7429) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:01:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][570/625] eta 0:00:34 lr 0.000057 wd 0.0500 time 0.6008 (0.6245) data time 0.0010 (0.0031) model time 0.5998 (0.6214) loss 7.0193 (6.7878) grad_norm 2.0029 (2.7405) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:01:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][580/625] eta 0:00:28 lr 0.000057 wd 0.0500 time 0.6042 (0.6242) data time 0.0008 (0.0031) model time 0.6035 (0.6211) loss 7.9613 (6.8017) grad_norm 4.4510 (2.7359) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:01:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][590/625] eta 0:00:21 lr 0.000057 wd 0.0500 time 0.6013 (0.6239) data time 0.0008 (0.0031) model time 0.6004 (0.6208) loss 6.4019 (6.7981) grad_norm 3.0213 (2.7294) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:02:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][600/625] eta 0:00:15 lr 0.000057 wd 0.0500 time 0.5990 (0.6234) data time 0.0010 (0.0031) model time 0.5980 (0.6204) loss 6.6614 (6.7927) grad_norm 3.4744 (2.7313) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:02:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][610/625] eta 0:00:09 lr 0.000057 wd 0.0500 time 0.5913 (0.6229) data time 0.0005 (0.0030) model time 0.5908 (0.6199) loss 6.1725 (6.7897) grad_norm 2.2539 (2.7294) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:02:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [264/300][620/625] eta 0:00:03 lr 0.000057 wd 0.0500 time 0.5922 (0.6225) data time 0.0005 (0.0030) model time 0.5916 (0.6196) loss 7.5898 (6.7872) grad_norm 3.2687 (2.7380) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:02:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 264 training takes 0:05:46 +[2024-07-29 05:02:19 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 05:02:28 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 05:02:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.513 (0.513) Loss 0.4971 (0.4971) Acc@1 90.381 (90.381) Acc@5 98.975 (98.975) Mem 22341MB +[2024-07-29 05:02:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.162) Loss 0.7397 (0.6005) Acc@1 82.861 (88.055) Acc@5 97.363 (98.122) Mem 22341MB +[2024-07-29 05:02:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.147) Loss 0.8159 (0.6843) Acc@1 80.713 (85.519) Acc@5 96.436 (97.398) Mem 22341MB +[2024-07-29 05:02:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.151 Acc@5 97.393 +[2024-07-29 05:02:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.2% +[2024-07-29 05:02:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.814 (0.814) Loss 0.4985 (0.4985) Acc@1 90.332 (90.332) Acc@5 98.975 (98.975) Mem 22341MB +[2024-07-29 05:02:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.189) Loss 0.7363 (0.6040) Acc@1 83.203 (88.179) Acc@5 97.168 (98.176) Mem 22341MB +[2024-07-29 05:02:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.160) Loss 0.8257 (0.6879) Acc@1 80.811 (85.517) Acc@5 96.289 (97.389) Mem 22341MB +[2024-07-29 05:02:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.123 Acc@5 97.391 +[2024-07-29 05:02:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.1% +[2024-07-29 05:02:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.12% +[2024-07-29 05:02:38 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 05:02:42 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 05:02:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][0/625] eta 0:11:58 lr 0.000057 wd 0.0500 time 1.1496 (1.1496) data time 0.4384 (0.4384) model time 0.0000 (0.0000) loss 6.2751 (6.2751) grad_norm 1.9048 (1.9048) loss_scale 128.0000 (128.0000) mem 22337MB +[2024-07-29 05:02:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][10/625] eta 0:06:36 lr 0.000057 wd 0.0500 time 0.5916 (0.6446) data time 0.0010 (0.0410) model time 0.0000 (0.0000) loss 7.7341 (6.9364) grad_norm 2.5464 (2.8257) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:02:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][20/625] eta 0:06:15 lr 0.000057 wd 0.0500 time 0.5931 (0.6202) data time 0.0009 (0.0220) model time 0.0000 (0.0000) loss 7.5319 (6.7190) grad_norm 2.4786 (2.6986) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:03:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][30/625] eta 0:06:05 lr 0.000057 wd 0.0500 time 0.5988 (0.6142) data time 0.0009 (0.0157) model time 0.0000 (0.0000) loss 7.3493 (6.7062) grad_norm 2.0810 (2.7097) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:03:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][40/625] eta 0:05:57 lr 0.000057 wd 0.0500 time 0.5888 (0.6110) data time 0.0008 (0.0121) model time 0.0000 (0.0000) loss 8.2256 (6.7597) grad_norm 2.4230 (2.6636) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:03:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][50/625] eta 0:05:49 lr 0.000057 wd 0.0500 time 0.5882 (0.6072) data time 0.0011 (0.0102) model time 0.0000 (0.0000) loss 7.2259 (6.7508) grad_norm 2.1699 (2.5795) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:03:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][60/625] eta 0:05:42 lr 0.000057 wd 0.0500 time 0.5958 (0.6053) data time 0.0010 (0.0087) model time 0.5948 (0.5947) loss 8.7158 (6.7833) grad_norm 4.3210 (2.6060) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:03:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][70/625] eta 0:05:36 lr 0.000057 wd 0.0500 time 0.5983 (0.6065) data time 0.0008 (0.0077) model time 0.5975 (0.6033) loss 6.2489 (6.7569) grad_norm 2.1592 (2.5527) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:03:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][80/625] eta 0:05:30 lr 0.000057 wd 0.0500 time 0.6423 (0.6059) data time 0.0009 (0.0069) model time 0.6414 (0.6023) loss 6.2388 (6.7684) grad_norm 2.1165 (2.5260) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:03:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][90/625] eta 0:05:23 lr 0.000057 wd 0.0500 time 0.5958 (0.6056) data time 0.0008 (0.0063) model time 0.5950 (0.6022) loss 5.6688 (6.7299) grad_norm 2.1943 (3.0727) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:03:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][100/625] eta 0:05:17 lr 0.000057 wd 0.0500 time 0.5937 (0.6046) data time 0.0010 (0.0058) model time 0.5926 (0.6007) loss 5.9490 (6.7406) grad_norm 2.5958 (3.0836) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:03:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][110/625] eta 0:05:11 lr 0.000057 wd 0.0500 time 0.5948 (0.6041) data time 0.0011 (0.0053) model time 0.5937 (0.6003) loss 7.0184 (6.7610) grad_norm 2.9028 (3.0903) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:03:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][120/625] eta 0:05:04 lr 0.000057 wd 0.0500 time 0.5993 (0.6034) data time 0.0008 (0.0051) model time 0.5985 (0.5993) loss 6.4820 (6.7720) grad_norm 2.3587 (3.0282) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:04:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][130/625] eta 0:04:58 lr 0.000057 wd 0.0500 time 0.5966 (0.6030) data time 0.0008 (0.0049) model time 0.5958 (0.5988) loss 7.1631 (6.7762) grad_norm 2.3073 (2.9703) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:04:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][140/625] eta 0:04:52 lr 0.000057 wd 0.0500 time 0.5953 (0.6025) data time 0.0011 (0.0046) model time 0.5941 (0.5983) loss 5.8385 (6.7569) grad_norm 1.8417 (2.9579) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:04:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 05:04:11 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 05:04:15 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 05:06:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 05:06:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 05:08:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 05:08:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 05:08:51 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 05:09:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 05:09:06 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 05:09:07 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 05:09:07 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 05:09:07 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 265) +[2024-07-29 05:09:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 05:09:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][150/625] eta 0:36:41 lr 0.000057 wd 0.0500 time 0.6011 (4.6355) data time 0.0009 (0.3149) model time 0.6002 (4.3206) loss 6.1959 (6.8533) grad_norm 2.1260 (2.1831) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:09:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][160/625] eta 0:11:53 lr 0.000057 wd 0.0500 time 0.6026 (1.5347) data time 0.0010 (0.0735) model time 0.6016 (1.4612) loss 6.5108 (6.9062) grad_norm 2.6869 (2.4064) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:09:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][170/625] eta 0:08:33 lr 0.000057 wd 0.0500 time 0.5999 (1.1297) data time 0.0008 (0.0420) model time 0.5991 (1.0876) loss 6.2620 (6.9562) grad_norm 2.4378 (2.3847) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:09:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][180/625] eta 0:07:11 lr 0.000056 wd 0.0500 time 0.5988 (0.9686) data time 0.0010 (0.0296) model time 0.5978 (0.9390) loss 7.8383 (7.0468) grad_norm 2.1239 (2.4249) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:09:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][190/625] eta 0:06:24 lr 0.000056 wd 0.0500 time 0.5979 (0.8828) data time 0.0010 (0.0230) model time 0.5968 (0.8598) loss 7.7177 (6.9746) grad_norm 3.6085 (2.4328) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:09:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][200/625] eta 0:05:53 lr 0.000056 wd 0.0500 time 0.5892 (0.8319) data time 0.0010 (0.0189) model time 0.5882 (0.8131) loss 6.8121 (6.9161) grad_norm 2.7065 (2.6324) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:10:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][210/625] eta 0:05:31 lr 0.000056 wd 0.0500 time 0.6038 (0.7991) data time 0.0011 (0.0160) model time 0.6027 (0.7831) loss 6.0871 (6.8564) grad_norm 3.6373 (2.6603) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:10:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][220/625] eta 0:05:12 lr 0.000056 wd 0.0500 time 0.6025 (0.7728) data time 0.0012 (0.0140) model time 0.6014 (0.7588) loss 7.2364 (6.8327) grad_norm 3.5011 (2.6725) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:10:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][230/625] eta 0:04:57 lr 0.000056 wd 0.0500 time 0.6088 (0.7530) data time 0.0008 (0.0124) model time 0.6079 (0.7406) loss 6.9280 (6.8236) grad_norm 2.2936 (2.7458) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:10:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][240/625] eta 0:04:43 lr 0.000056 wd 0.0500 time 0.6092 (0.7374) data time 0.0008 (0.0112) model time 0.6084 (0.7262) loss 7.7939 (6.8168) grad_norm 1.9412 (2.7475) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:10:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][250/625] eta 0:04:31 lr 0.000056 wd 0.0500 time 0.6012 (0.7243) data time 0.0009 (0.0103) model time 0.6003 (0.7141) loss 6.9723 (6.8551) grad_norm 2.1396 (2.6979) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:10:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][260/625] eta 0:04:20 lr 0.000056 wd 0.0500 time 0.5968 (0.7134) data time 0.0008 (0.0095) model time 0.5959 (0.7039) loss 6.0776 (6.8299) grad_norm 1.6536 (2.6581) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:10:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][270/625] eta 0:04:10 lr 0.000056 wd 0.0500 time 0.5986 (0.7043) data time 0.0011 (0.0088) model time 0.5975 (0.6955) loss 6.8373 (6.8356) grad_norm 2.8113 (2.6505) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:10:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][280/625] eta 0:04:00 lr 0.000056 wd 0.0500 time 0.6142 (0.6965) data time 0.0010 (0.0082) model time 0.6132 (0.6883) loss 7.6752 (6.8312) grad_norm 2.4622 (2.6293) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:10:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][290/625] eta 0:03:51 lr 0.000056 wd 0.0500 time 0.6085 (0.6903) data time 0.0011 (0.0077) model time 0.6074 (0.6826) loss 8.4727 (6.8393) grad_norm 2.1762 (2.6415) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:10:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][300/625] eta 0:03:42 lr 0.000056 wd 0.0500 time 0.6075 (0.6849) data time 0.0008 (0.0073) model time 0.6067 (0.6777) loss 6.7388 (6.8407) grad_norm 2.8025 (2.6316) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:11:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][310/625] eta 0:03:34 lr 0.000056 wd 0.0500 time 0.6041 (0.6802) data time 0.0008 (0.0069) model time 0.6033 (0.6733) loss 5.8157 (6.8400) grad_norm 3.1914 (2.6786) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:11:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][320/625] eta 0:03:26 lr 0.000056 wd 0.0500 time 0.5942 (0.6758) data time 0.0010 (0.0065) model time 0.5932 (0.6692) loss 7.7267 (6.8531) grad_norm 1.8817 (2.6727) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:11:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][330/625] eta 0:03:18 lr 0.000056 wd 0.0500 time 0.6013 (0.6717) data time 0.0011 (0.0062) model time 0.6002 (0.6655) loss 7.3416 (6.8518) grad_norm 2.0671 (2.7274) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:11:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][340/625] eta 0:03:10 lr 0.000056 wd 0.0500 time 0.5982 (0.6680) data time 0.0008 (0.0060) model time 0.5974 (0.6620) loss 6.3689 (6.8362) grad_norm 1.9978 (2.7628) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:11:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][350/625] eta 0:03:02 lr 0.000056 wd 0.0500 time 0.5986 (0.6646) data time 0.0009 (0.0057) model time 0.5978 (0.6588) loss 5.7968 (6.8222) grad_norm 2.8071 (2.7604) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:11:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][360/625] eta 0:02:55 lr 0.000056 wd 0.0500 time 0.6061 (0.6618) data time 0.0011 (0.0055) model time 0.6050 (0.6563) loss 6.1494 (6.8231) grad_norm 2.7232 (2.7562) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:11:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][370/625] eta 0:02:48 lr 0.000056 wd 0.0500 time 0.6074 (0.6594) data time 0.0008 (0.0053) model time 0.6066 (0.6541) loss 6.5747 (6.8208) grad_norm 1.9506 (2.7544) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:11:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][380/625] eta 0:02:41 lr 0.000056 wd 0.0500 time 0.6078 (0.6572) data time 0.0010 (0.0051) model time 0.6068 (0.6521) loss 6.0895 (6.8210) grad_norm 1.9384 (2.7441) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:11:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][390/625] eta 0:02:33 lr 0.000056 wd 0.0500 time 0.5965 (0.6551) data time 0.0011 (0.0050) model time 0.5955 (0.6502) loss 8.1132 (6.8224) grad_norm 5.4416 (2.7769) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:11:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][400/625] eta 0:02:26 lr 0.000056 wd 0.0500 time 0.5999 (0.6530) data time 0.0010 (0.0048) model time 0.5989 (0.6482) loss 7.4332 (6.8176) grad_norm 2.2166 (2.7991) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:12:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][410/625] eta 0:02:19 lr 0.000056 wd 0.0500 time 0.5929 (0.6510) data time 0.0008 (0.0047) model time 0.5921 (0.6463) loss 6.0816 (6.8064) grad_norm 1.9610 (2.8235) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:12:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][420/625] eta 0:02:13 lr 0.000056 wd 0.0500 time 0.5922 (0.6496) data time 0.0008 (0.0045) model time 0.5913 (0.6450) loss 6.5910 (6.8006) grad_norm 2.4762 (2.8328) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:12:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][430/625] eta 0:02:06 lr 0.000055 wd 0.0500 time 0.6076 (0.6488) data time 0.0011 (0.0044) model time 0.6065 (0.6444) loss 7.8074 (6.8082) grad_norm 2.5381 (2.8402) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:12:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][440/625] eta 0:01:59 lr 0.000055 wd 0.0500 time 0.6083 (0.6474) data time 0.0011 (0.0043) model time 0.6072 (0.6431) loss 6.5423 (6.8057) grad_norm 1.8951 (2.8207) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:12:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][450/625] eta 0:01:53 lr 0.000055 wd 0.0500 time 0.6111 (0.6462) data time 0.0008 (0.0042) model time 0.6103 (0.6420) loss 6.8861 (6.7991) grad_norm 2.8008 (2.8301) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:12:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][460/625] eta 0:01:46 lr 0.000055 wd 0.0500 time 0.6093 (0.6450) data time 0.0008 (0.0041) model time 0.6085 (0.6409) loss 7.0720 (6.8008) grad_norm 2.8991 (2.8232) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:12:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][470/625] eta 0:01:39 lr 0.000055 wd 0.0500 time 0.6000 (0.6437) data time 0.0010 (0.0040) model time 0.5990 (0.6397) loss 8.5009 (6.8185) grad_norm 1.9215 (2.8107) loss_scale 128.0000 (128.0000) mem 22341MB +[2024-07-29 05:12:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][480/625] eta 0:01:33 lr 0.000055 wd 0.0500 time 0.6046 (0.6425) data time 0.0010 (0.0039) model time 0.6035 (0.6386) loss 6.6433 (6.8174) grad_norm 2.4350 (2.9385) loss_scale 256.0000 (131.8438) mem 22341MB +[2024-07-29 05:12:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][490/625] eta 0:01:26 lr 0.000055 wd 0.0500 time 0.5987 (0.6413) data time 0.0011 (0.0038) model time 0.5976 (0.6375) loss 6.7375 (6.8142) grad_norm 2.8814 (2.9588) loss_scale 256.0000 (135.4636) mem 22341MB +[2024-07-29 05:12:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][500/625] eta 0:01:20 lr 0.000055 wd 0.0500 time 0.6036 (0.6402) data time 0.0010 (0.0037) model time 0.6026 (0.6364) loss 7.0072 (6.8129) grad_norm 2.3489 (2.9522) loss_scale 256.0000 (138.8782) mem 22341MB +[2024-07-29 05:13:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][510/625] eta 0:01:13 lr 0.000055 wd 0.0500 time 0.6091 (0.6393) data time 0.0008 (0.0037) model time 0.6083 (0.6357) loss 6.7811 (6.8148) grad_norm 2.1734 (2.9378) loss_scale 256.0000 (142.1047) mem 22341MB +[2024-07-29 05:13:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][520/625] eta 0:01:07 lr 0.000055 wd 0.0500 time 0.6088 (0.6386) data time 0.0008 (0.0036) model time 0.6080 (0.6350) loss 7.4652 (6.8200) grad_norm 3.6898 (2.9323) loss_scale 256.0000 (145.1582) mem 22341MB +[2024-07-29 05:13:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][530/625] eta 0:01:00 lr 0.000055 wd 0.0500 time 0.6049 (0.6378) data time 0.0011 (0.0035) model time 0.6038 (0.6343) loss 6.6940 (6.8178) grad_norm 3.7448 (2.9359) loss_scale 256.0000 (148.0522) mem 22341MB +[2024-07-29 05:13:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][540/625] eta 0:00:54 lr 0.000055 wd 0.0500 time 0.6004 (0.6370) data time 0.0012 (0.0035) model time 0.5992 (0.6335) loss 6.9077 (6.8073) grad_norm 1.9521 (2.9238) loss_scale 256.0000 (150.7990) mem 22341MB +[2024-07-29 05:13:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][550/625] eta 0:00:47 lr 0.000055 wd 0.0500 time 0.6015 (0.6362) data time 0.0011 (0.0034) model time 0.6004 (0.6328) loss 6.1138 (6.8070) grad_norm 2.0578 (2.9086) loss_scale 256.0000 (153.4094) mem 22341MB +[2024-07-29 05:13:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][560/625] eta 0:00:41 lr 0.000055 wd 0.0500 time 0.5982 (0.6354) data time 0.0011 (0.0033) model time 0.5972 (0.6320) loss 7.4619 (6.8118) grad_norm 1.8718 (2.9052) loss_scale 256.0000 (155.8935) mem 22341MB +[2024-07-29 05:13:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][570/625] eta 0:00:34 lr 0.000055 wd 0.0500 time 0.5959 (0.6346) data time 0.0008 (0.0033) model time 0.5951 (0.6313) loss 5.5959 (6.8057) grad_norm 2.5512 (2.8973) loss_scale 256.0000 (158.2600) mem 22341MB +[2024-07-29 05:13:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][580/625] eta 0:00:28 lr 0.000055 wd 0.0500 time 0.6078 (0.6340) data time 0.0008 (0.0032) model time 0.6069 (0.6308) loss 7.5437 (6.8136) grad_norm 2.7131 (2.9096) loss_scale 256.0000 (160.5173) mem 22341MB +[2024-07-29 05:13:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][590/625] eta 0:00:22 lr 0.000055 wd 0.0500 time 0.5896 (0.6335) data time 0.0012 (0.0032) model time 0.5884 (0.6303) loss 6.5191 (6.8151) grad_norm 6.2847 (2.9192) loss_scale 256.0000 (162.6727) mem 22341MB +[2024-07-29 05:13:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][600/625] eta 0:00:15 lr 0.000055 wd 0.0500 time 0.6054 (0.6330) data time 0.0008 (0.0031) model time 0.6046 (0.6298) loss 5.3190 (6.8116) grad_norm 2.7847 (2.9072) loss_scale 256.0000 (164.7329) mem 22341MB +[2024-07-29 05:14:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][610/625] eta 0:00:09 lr 0.000055 wd 0.0500 time 0.5995 (0.6324) data time 0.0005 (0.0031) model time 0.5990 (0.6293) loss 5.8719 (6.8016) grad_norm 2.6071 (2.8968) loss_scale 256.0000 (166.7041) mem 22341MB +[2024-07-29 05:14:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [265/300][620/625] eta 0:00:03 lr 0.000055 wd 0.0500 time 0.5984 (0.6318) data time 0.0005 (0.0031) model time 0.5979 (0.6287) loss 6.9480 (6.7966) grad_norm 2.6362 (2.9103) loss_scale 256.0000 (168.5920) mem 22341MB +[2024-07-29 05:14:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 265 training takes 0:05:01 +[2024-07-29 05:14:12 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 05:14:16 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 05:14:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.509 (0.509) Loss 0.4971 (0.4971) Acc@1 90.234 (90.234) Acc@5 99.023 (99.023) Mem 22341MB +[2024-07-29 05:14:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.127 (0.161) Loss 0.7407 (0.5984) Acc@1 83.301 (88.201) Acc@5 97.119 (98.109) Mem 22341MB +[2024-07-29 05:14:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.145) Loss 0.8315 (0.6868) Acc@1 80.469 (85.489) Acc@5 96.045 (97.338) Mem 22341MB +[2024-07-29 05:14:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.121 Acc@5 97.335 +[2024-07-29 05:14:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.1% +[2024-07-29 05:14:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.886 (0.886) Loss 0.4983 (0.4983) Acc@1 90.332 (90.332) Acc@5 98.975 (98.975) Mem 22341MB +[2024-07-29 05:14:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.195) Loss 0.7363 (0.6034) Acc@1 83.154 (88.179) Acc@5 97.070 (98.171) Mem 22341MB +[2024-07-29 05:14:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.163) Loss 0.8252 (0.6876) Acc@1 80.859 (85.519) Acc@5 96.289 (97.387) Mem 22341MB +[2024-07-29 05:14:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.127 Acc@5 97.393 +[2024-07-29 05:14:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.1% +[2024-07-29 05:14:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.13% +[2024-07-29 05:14:26 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 05:14:29 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 05:14:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][0/625] eta 0:12:31 lr 0.000055 wd 0.0500 time 1.2018 (1.2018) data time 0.4836 (0.4836) model time 0.0000 (0.0000) loss 6.0841 (6.0841) grad_norm 3.3662 (3.3662) loss_scale 256.0000 (256.0000) mem 22337MB +[2024-07-29 05:14:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][10/625] eta 0:06:45 lr 0.000055 wd 0.0500 time 0.6028 (0.6591) data time 0.0011 (0.0449) model time 0.0000 (0.0000) loss 7.1889 (7.0212) grad_norm 2.1973 (2.9077) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:14:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][20/625] eta 0:06:27 lr 0.000055 wd 0.0500 time 0.6019 (0.6411) data time 0.0008 (0.0240) model time 0.0000 (0.0000) loss 6.3186 (6.8427) grad_norm 2.2375 (2.6549) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:14:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][30/625] eta 0:06:14 lr 0.000055 wd 0.0500 time 0.6043 (0.6288) data time 0.0008 (0.0166) model time 0.0000 (0.0000) loss 7.8602 (6.8701) grad_norm 1.7443 (2.6014) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:14:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][40/625] eta 0:06:04 lr 0.000055 wd 0.0500 time 0.6045 (0.6222) data time 0.0010 (0.0128) model time 0.0000 (0.0000) loss 6.1508 (6.8923) grad_norm 3.0999 (2.5686) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:15:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][50/625] eta 0:05:55 lr 0.000055 wd 0.0500 time 0.6045 (0.6182) data time 0.0008 (0.0105) model time 0.0000 (0.0000) loss 6.1472 (6.8209) grad_norm 1.7319 (2.5733) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:15:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][60/625] eta 0:05:47 lr 0.000054 wd 0.0500 time 0.6077 (0.6155) data time 0.0008 (0.0089) model time 0.6070 (0.6005) loss 7.0894 (6.8196) grad_norm 2.8277 (2.5948) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:15:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][70/625] eta 0:05:41 lr 0.000054 wd 0.0500 time 0.6145 (0.6146) data time 0.0008 (0.0078) model time 0.6137 (0.6045) loss 6.8828 (6.8116) grad_norm 2.1360 (2.6028) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:15:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][80/625] eta 0:05:34 lr 0.000054 wd 0.0500 time 0.6015 (0.6136) data time 0.0012 (0.0071) model time 0.6003 (0.6043) loss 7.2009 (6.8134) grad_norm 1.9249 (2.5801) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:15:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][90/625] eta 0:05:27 lr 0.000054 wd 0.0500 time 0.6126 (0.6128) data time 0.0008 (0.0064) model time 0.6118 (0.6046) loss 7.2126 (6.8328) grad_norm 2.9190 (2.6608) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:15:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][100/625] eta 0:05:21 lr 0.000054 wd 0.0500 time 0.6225 (0.6120) data time 0.0011 (0.0059) model time 0.6214 (0.6044) loss 7.2102 (6.8439) grad_norm 2.7566 (2.6668) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:15:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][110/625] eta 0:05:14 lr 0.000054 wd 0.0500 time 0.6072 (0.6113) data time 0.0010 (0.0054) model time 0.6062 (0.6042) loss 7.7191 (6.8535) grad_norm 2.7005 (2.6719) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:15:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][120/625] eta 0:05:08 lr 0.000054 wd 0.0500 time 0.6034 (0.6106) data time 0.0010 (0.0051) model time 0.6024 (0.6039) loss 5.8247 (6.8341) grad_norm 2.4394 (2.6426) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:15:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][130/625] eta 0:05:01 lr 0.000054 wd 0.0500 time 0.6036 (0.6100) data time 0.0010 (0.0048) model time 0.6026 (0.6036) loss 6.3659 (6.8297) grad_norm 1.7929 (2.7108) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:15:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][140/625] eta 0:04:56 lr 0.000054 wd 0.0500 time 0.6062 (0.6106) data time 0.0011 (0.0045) model time 0.6051 (0.6051) loss 6.6923 (6.8423) grad_norm 7.8232 (2.7852) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:16:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][150/625] eta 0:04:50 lr 0.000054 wd 0.0500 time 0.6042 (0.6117) data time 0.0008 (0.0043) model time 0.6034 (0.6073) loss 6.2517 (6.8462) grad_norm 4.9226 (2.7955) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:16:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][160/625] eta 0:04:44 lr 0.000054 wd 0.0500 time 0.6283 (0.6116) data time 0.0017 (0.0041) model time 0.6267 (0.6074) loss 8.0560 (6.8614) grad_norm 3.1304 (2.7715) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:16:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][170/625] eta 0:04:38 lr 0.000054 wd 0.0500 time 0.6035 (0.6113) data time 0.0010 (0.0039) model time 0.6025 (0.6072) loss 7.2181 (6.8405) grad_norm 2.3911 (2.7576) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:16:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][180/625] eta 0:04:32 lr 0.000054 wd 0.0500 time 0.6071 (0.6119) data time 0.0007 (0.0037) model time 0.6064 (0.6083) loss 6.0922 (6.8117) grad_norm 2.7295 (2.7343) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:16:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][190/625] eta 0:04:26 lr 0.000054 wd 0.0500 time 0.6342 (0.6121) data time 0.0010 (0.0036) model time 0.6332 (0.6087) loss 5.4632 (6.8124) grad_norm 1.9212 (2.7284) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:16:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][200/625] eta 0:04:19 lr 0.000054 wd 0.0500 time 0.6019 (0.6116) data time 0.0011 (0.0035) model time 0.6009 (0.6082) loss 7.5129 (6.8128) grad_norm 2.4910 (2.7043) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:16:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][210/625] eta 0:04:13 lr 0.000054 wd 0.0500 time 0.6071 (0.6115) data time 0.0008 (0.0034) model time 0.6064 (0.6082) loss 5.9786 (6.7966) grad_norm 3.2057 (2.7109) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:16:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][220/625] eta 0:04:07 lr 0.000054 wd 0.0500 time 0.6098 (0.6119) data time 0.0009 (0.0033) model time 0.6090 (0.6088) loss 6.9682 (6.7909) grad_norm 2.5327 (2.6943) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:16:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][230/625] eta 0:04:01 lr 0.000054 wd 0.0500 time 0.6111 (0.6116) data time 0.0011 (0.0032) model time 0.6101 (0.6086) loss 6.3206 (6.7828) grad_norm 6.5308 (2.7073) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:16:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][240/625] eta 0:03:55 lr 0.000054 wd 0.0500 time 0.6019 (0.6121) data time 0.0008 (0.0031) model time 0.6011 (0.6093) loss 7.4731 (6.7800) grad_norm 2.3033 (2.7005) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:17:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][250/625] eta 0:03:49 lr 0.000054 wd 0.0500 time 0.5974 (0.6117) data time 0.0011 (0.0030) model time 0.5963 (0.6089) loss 6.0727 (6.7690) grad_norm 3.3599 (2.7230) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:17:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][260/625] eta 0:03:43 lr 0.000054 wd 0.0500 time 0.6020 (0.6115) data time 0.0010 (0.0030) model time 0.6010 (0.6087) loss 7.2204 (6.7892) grad_norm 1.9394 (2.7195) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:17:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][270/625] eta 0:03:37 lr 0.000054 wd 0.0500 time 0.6083 (0.6113) data time 0.0009 (0.0029) model time 0.6074 (0.6086) loss 5.6095 (6.7885) grad_norm 3.3644 (2.7121) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:17:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][280/625] eta 0:03:30 lr 0.000054 wd 0.0500 time 0.6018 (0.6113) data time 0.0008 (0.0028) model time 0.6010 (0.6087) loss 7.5790 (6.7889) grad_norm 3.8086 (2.7113) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:17:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][290/625] eta 0:03:24 lr 0.000054 wd 0.0500 time 0.6193 (0.6112) data time 0.0011 (0.0028) model time 0.6182 (0.6086) loss 6.9155 (6.7924) grad_norm 3.8332 (2.7052) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:17:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][300/625] eta 0:03:18 lr 0.000054 wd 0.0500 time 0.6061 (0.6111) data time 0.0009 (0.0027) model time 0.6052 (0.6086) loss 7.5689 (6.7996) grad_norm 1.7596 (2.7163) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:17:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][310/625] eta 0:03:12 lr 0.000053 wd 0.0500 time 0.6044 (0.6115) data time 0.0008 (0.0027) model time 0.6035 (0.6091) loss 6.5870 (6.7919) grad_norm 2.2493 (2.7144) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:17:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][320/625] eta 0:03:06 lr 0.000053 wd 0.0500 time 0.7059 (0.6117) data time 0.0007 (0.0026) model time 0.7051 (0.6094) loss 6.2505 (6.7905) grad_norm 3.6845 (2.7110) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:17:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][330/625] eta 0:03:00 lr 0.000053 wd 0.0500 time 0.6771 (0.6117) data time 0.0011 (0.0026) model time 0.6760 (0.6094) loss 7.3146 (6.7888) grad_norm 3.8915 (2.7126) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:17:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][340/625] eta 0:02:54 lr 0.000053 wd 0.0500 time 0.6048 (0.6120) data time 0.0011 (0.0026) model time 0.6037 (0.6096) loss 7.0215 (6.7884) grad_norm 3.4984 (2.7313) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:18:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][350/625] eta 0:02:48 lr 0.000053 wd 0.0500 time 0.6064 (0.6119) data time 0.0011 (0.0026) model time 0.6053 (0.6095) loss 7.9667 (6.7791) grad_norm 2.2210 (2.7562) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:18:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][360/625] eta 0:02:42 lr 0.000053 wd 0.0500 time 0.6206 (0.6116) data time 0.0008 (0.0026) model time 0.6198 (0.6093) loss 7.5331 (6.7801) grad_norm 2.4729 (2.7466) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:18:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][370/625] eta 0:02:35 lr 0.000053 wd 0.0500 time 0.6055 (0.6116) data time 0.0008 (0.0025) model time 0.6047 (0.6093) loss 5.3961 (6.7707) grad_norm 1.9401 (2.7433) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:18:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][380/625] eta 0:02:29 lr 0.000053 wd 0.0500 time 0.6009 (0.6115) data time 0.0008 (0.0025) model time 0.6001 (0.6091) loss 5.7485 (6.7721) grad_norm 1.8589 (2.7323) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:18:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][390/625] eta 0:02:23 lr 0.000053 wd 0.0500 time 0.6024 (0.6114) data time 0.0009 (0.0025) model time 0.6016 (0.6091) loss 7.3475 (6.7746) grad_norm 4.0326 (2.7584) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:18:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][400/625] eta 0:02:17 lr 0.000053 wd 0.0500 time 0.6039 (0.6116) data time 0.0011 (0.0024) model time 0.6028 (0.6094) loss 7.2784 (6.7704) grad_norm 2.8147 (2.7927) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:18:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][410/625] eta 0:02:11 lr 0.000053 wd 0.0500 time 0.6004 (0.6114) data time 0.0012 (0.0024) model time 0.5992 (0.6092) loss 7.7221 (6.7698) grad_norm 1.9868 (2.7892) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:18:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][420/625] eta 0:02:05 lr 0.000053 wd 0.0500 time 0.6005 (0.6111) data time 0.0008 (0.0024) model time 0.5997 (0.6089) loss 5.4133 (6.7651) grad_norm 2.5363 (2.7849) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:18:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][430/625] eta 0:01:59 lr 0.000053 wd 0.0500 time 0.6030 (0.6110) data time 0.0012 (0.0023) model time 0.6018 (0.6088) loss 6.6392 (6.7708) grad_norm 4.0351 (2.7775) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:18:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][440/625] eta 0:01:52 lr 0.000053 wd 0.0500 time 0.6057 (0.6108) data time 0.0012 (0.0023) model time 0.6045 (0.6086) loss 6.5973 (6.7729) grad_norm 3.4408 (2.7736) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:19:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][450/625] eta 0:01:46 lr 0.000053 wd 0.0500 time 0.6122 (0.6107) data time 0.0008 (0.0023) model time 0.6113 (0.6085) loss 6.2723 (6.7817) grad_norm 2.3306 (2.7846) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:19:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][460/625] eta 0:01:40 lr 0.000053 wd 0.0500 time 0.8143 (0.6110) data time 0.0009 (0.0022) model time 0.8134 (0.6088) loss 5.9680 (6.7810) grad_norm 1.9524 (2.7876) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:19:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][470/625] eta 0:01:34 lr 0.000053 wd 0.0500 time 0.6019 (0.6108) data time 0.0008 (0.0022) model time 0.6011 (0.6086) loss 6.4542 (6.7740) grad_norm 2.0274 (2.7840) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:19:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][480/625] eta 0:01:28 lr 0.000053 wd 0.0500 time 0.6005 (0.6106) data time 0.0009 (0.0022) model time 0.5996 (0.6085) loss 7.7407 (6.7701) grad_norm 2.6399 (2.7790) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:19:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][490/625] eta 0:01:22 lr 0.000053 wd 0.0500 time 0.5991 (0.6104) data time 0.0010 (0.0022) model time 0.5981 (0.6083) loss 7.7207 (6.7719) grad_norm 1.8539 (2.7966) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:19:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][500/625] eta 0:01:16 lr 0.000053 wd 0.0500 time 0.6059 (0.6102) data time 0.0011 (0.0021) model time 0.6048 (0.6081) loss 5.9626 (6.7750) grad_norm 3.0354 (2.8049) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:19:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][510/625] eta 0:01:10 lr 0.000053 wd 0.0500 time 0.6045 (0.6101) data time 0.0012 (0.0021) model time 0.6033 (0.6080) loss 5.7559 (6.7740) grad_norm 2.2556 (2.8160) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:19:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][520/625] eta 0:01:04 lr 0.000053 wd 0.0500 time 0.6023 (0.6100) data time 0.0011 (0.0021) model time 0.6012 (0.6079) loss 7.5316 (6.7781) grad_norm 2.1056 (2.8065) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:19:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][530/625] eta 0:00:57 lr 0.000053 wd 0.0500 time 0.6056 (0.6099) data time 0.0008 (0.0021) model time 0.6048 (0.6078) loss 7.9761 (6.7791) grad_norm 2.3404 (2.8049) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:19:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][540/625] eta 0:00:51 lr 0.000053 wd 0.0500 time 0.6124 (0.6097) data time 0.0008 (0.0021) model time 0.6116 (0.6077) loss 5.8604 (6.7808) grad_norm 2.1260 (2.8035) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:20:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][550/625] eta 0:00:45 lr 0.000053 wd 0.0500 time 0.6067 (0.6096) data time 0.0011 (0.0020) model time 0.6056 (0.6075) loss 6.5799 (6.7766) grad_norm 2.4085 (2.7937) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:20:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][560/625] eta 0:00:39 lr 0.000053 wd 0.0500 time 0.5999 (0.6095) data time 0.0008 (0.0020) model time 0.5991 (0.6074) loss 7.5515 (6.7781) grad_norm 3.1361 (2.7956) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:20:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][570/625] eta 0:00:33 lr 0.000052 wd 0.0500 time 0.6011 (0.6093) data time 0.0010 (0.0020) model time 0.6001 (0.6073) loss 5.9507 (6.7766) grad_norm 1.8904 (2.7950) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:20:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][580/625] eta 0:00:27 lr 0.000052 wd 0.0500 time 0.6058 (0.6093) data time 0.0008 (0.0020) model time 0.6050 (0.6072) loss 7.5971 (6.7792) grad_norm 2.4336 (2.7904) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:20:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][590/625] eta 0:00:21 lr 0.000052 wd 0.0500 time 0.6003 (0.6092) data time 0.0011 (0.0020) model time 0.5992 (0.6072) loss 7.2791 (6.7795) grad_norm 3.0113 (2.7978) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:20:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][600/625] eta 0:00:15 lr 0.000052 wd 0.0500 time 0.6084 (0.6091) data time 0.0012 (0.0020) model time 0.6072 (0.6071) loss 6.8370 (6.7744) grad_norm 2.2925 (2.7962) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:20:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][610/625] eta 0:00:09 lr 0.000052 wd 0.0500 time 0.6024 (0.6090) data time 0.0006 (0.0020) model time 0.6018 (0.6070) loss 6.5565 (6.7764) grad_norm 1.9346 (2.7907) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:20:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [266/300][620/625] eta 0:00:03 lr 0.000052 wd 0.0500 time 0.6006 (0.6091) data time 0.0005 (0.0019) model time 0.6001 (0.6071) loss 6.3452 (6.7776) grad_norm 1.9437 (2.7856) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:20:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 266 training takes 0:06:20 +[2024-07-29 05:20:50 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 05:20:53 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 05:20:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.527 (0.527) Loss 0.4885 (0.4885) Acc@1 90.576 (90.576) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-29 05:20:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.163) Loss 0.7314 (0.5929) Acc@1 83.057 (88.224) Acc@5 97.119 (98.145) Mem 22339MB +[2024-07-29 05:20:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.145) Loss 0.8154 (0.6803) Acc@1 81.055 (85.561) Acc@5 96.387 (97.394) Mem 22339MB +[2024-07-29 05:20:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.213 Acc@5 97.397 +[2024-07-29 05:20:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.2% +[2024-07-29 05:20:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.843 (0.843) Loss 0.4973 (0.4973) Acc@1 90.430 (90.430) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-29 05:20:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.127 (0.192) Loss 0.7363 (0.6028) Acc@1 83.154 (88.192) Acc@5 97.070 (98.180) Mem 22339MB +[2024-07-29 05:20:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.127 (0.161) Loss 0.8247 (0.6869) Acc@1 80.859 (85.547) Acc@5 96.289 (97.400) Mem 22339MB +[2024-07-29 05:21:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.147 Acc@5 97.407 +[2024-07-29 05:21:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.1% +[2024-07-29 05:21:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.15% +[2024-07-29 05:21:00 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 05:21:03 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 05:21:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][0/625] eta 0:09:55 lr 0.000052 wd 0.0500 time 0.9526 (0.9526) data time 0.4180 (0.4180) model time 0.0000 (0.0000) loss 8.1648 (8.1648) grad_norm 2.3388 (2.3388) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:21:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][10/625] eta 0:06:27 lr 0.000052 wd 0.0500 time 0.6014 (0.6306) data time 0.0008 (0.0389) model time 0.0000 (0.0000) loss 6.5364 (6.9683) grad_norm 1.8845 (2.3030) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:21:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][20/625] eta 0:06:14 lr 0.000052 wd 0.0500 time 0.6016 (0.6185) data time 0.0011 (0.0209) model time 0.0000 (0.0000) loss 6.2607 (6.9068) grad_norm 4.1610 (2.3003) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:21:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][30/625] eta 0:06:05 lr 0.000052 wd 0.0500 time 0.5980 (0.6138) data time 0.0011 (0.0145) model time 0.0000 (0.0000) loss 6.1322 (6.8375) grad_norm 4.4419 (2.5566) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:21:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][40/625] eta 0:05:57 lr 0.000052 wd 0.0500 time 0.6019 (0.6110) data time 0.0009 (0.0112) model time 0.0000 (0.0000) loss 6.7818 (6.8510) grad_norm 4.8738 (2.6453) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:21:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][50/625] eta 0:05:50 lr 0.000052 wd 0.0500 time 0.6012 (0.6090) data time 0.0009 (0.0092) model time 0.0000 (0.0000) loss 6.8900 (6.8172) grad_norm 2.8703 (2.7167) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:21:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][60/625] eta 0:05:46 lr 0.000052 wd 0.0500 time 0.6011 (0.6132) data time 0.0010 (0.0079) model time 0.6001 (0.6338) loss 5.7619 (6.7550) grad_norm 2.3648 (inf) loss_scale 128.0000 (245.5082) mem 22339MB +[2024-07-29 05:21:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][70/625] eta 0:05:39 lr 0.000052 wd 0.0500 time 0.5999 (0.6114) data time 0.0008 (0.0069) model time 0.5990 (0.6164) loss 7.0652 (6.7625) grad_norm 1.7983 (inf) loss_scale 128.0000 (228.9577) mem 22339MB +[2024-07-29 05:21:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][80/625] eta 0:05:32 lr 0.000052 wd 0.0500 time 0.6032 (0.6100) data time 0.0008 (0.0062) model time 0.6024 (0.6108) loss 7.3449 (6.8242) grad_norm 2.0887 (inf) loss_scale 128.0000 (216.4938) mem 22339MB +[2024-07-29 05:21:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][90/625] eta 0:05:25 lr 0.000052 wd 0.0500 time 0.6032 (0.6092) data time 0.0010 (0.0056) model time 0.6022 (0.6085) loss 6.9622 (6.8353) grad_norm 2.4386 (inf) loss_scale 128.0000 (206.7692) mem 22339MB +[2024-07-29 05:22:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][100/625] eta 0:05:19 lr 0.000052 wd 0.0500 time 0.6011 (0.6088) data time 0.0007 (0.0052) model time 0.6003 (0.6076) loss 7.1884 (6.8344) grad_norm 2.2992 (inf) loss_scale 128.0000 (198.9703) mem 22339MB +[2024-07-29 05:22:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][110/625] eta 0:05:13 lr 0.000052 wd 0.0500 time 0.6034 (0.6086) data time 0.0008 (0.0048) model time 0.6025 (0.6073) loss 6.9329 (6.8228) grad_norm 2.0523 (inf) loss_scale 128.0000 (192.5766) mem 22339MB +[2024-07-29 05:22:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][120/625] eta 0:05:07 lr 0.000052 wd 0.0500 time 0.6041 (0.6082) data time 0.0011 (0.0045) model time 0.6031 (0.6066) loss 6.7560 (6.7968) grad_norm 2.3118 (inf) loss_scale 128.0000 (187.2397) mem 22339MB +[2024-07-29 05:22:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][130/625] eta 0:05:00 lr 0.000052 wd 0.0500 time 0.6005 (0.6076) data time 0.0009 (0.0042) model time 0.5997 (0.6058) loss 7.8430 (6.7968) grad_norm 2.0326 (inf) loss_scale 128.0000 (182.7176) mem 22339MB +[2024-07-29 05:22:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][140/625] eta 0:04:54 lr 0.000052 wd 0.0500 time 0.5985 (0.6073) data time 0.0011 (0.0040) model time 0.5974 (0.6053) loss 7.4502 (6.8113) grad_norm 3.5400 (inf) loss_scale 128.0000 (178.8369) mem 22339MB +[2024-07-29 05:22:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][150/625] eta 0:04:48 lr 0.000052 wd 0.0500 time 0.5994 (0.6080) data time 0.0011 (0.0038) model time 0.5983 (0.6064) loss 6.3467 (6.7821) grad_norm 1.9014 (inf) loss_scale 128.0000 (175.4702) mem 22339MB +[2024-07-29 05:22:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][160/625] eta 0:04:42 lr 0.000052 wd 0.0500 time 0.5928 (0.6077) data time 0.0011 (0.0036) model time 0.5917 (0.6061) loss 6.9909 (6.7870) grad_norm 2.3722 (inf) loss_scale 128.0000 (172.5217) mem 22339MB +[2024-07-29 05:22:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][170/625] eta 0:04:36 lr 0.000052 wd 0.0500 time 0.6037 (0.6076) data time 0.0010 (0.0035) model time 0.6027 (0.6060) loss 7.6621 (6.7913) grad_norm 2.7558 (inf) loss_scale 128.0000 (169.9181) mem 22339MB +[2024-07-29 05:22:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][180/625] eta 0:04:30 lr 0.000052 wd 0.0500 time 0.6036 (0.6075) data time 0.0011 (0.0033) model time 0.6025 (0.6059) loss 6.5219 (6.8037) grad_norm 1.7788 (inf) loss_scale 128.0000 (167.6022) mem 22339MB +[2024-07-29 05:22:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][190/625] eta 0:04:24 lr 0.000052 wd 0.0500 time 0.5998 (0.6073) data time 0.0010 (0.0032) model time 0.5987 (0.6056) loss 8.1548 (6.8053) grad_norm 2.5668 (inf) loss_scale 128.0000 (165.5288) mem 22339MB +[2024-07-29 05:23:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][200/625] eta 0:04:17 lr 0.000051 wd 0.0500 time 0.6016 (0.6070) data time 0.0010 (0.0031) model time 0.6006 (0.6053) loss 6.5294 (6.7987) grad_norm 1.9376 (inf) loss_scale 128.0000 (163.6617) mem 22339MB +[2024-07-29 05:23:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][210/625] eta 0:04:11 lr 0.000051 wd 0.0500 time 0.5997 (0.6067) data time 0.0010 (0.0030) model time 0.5987 (0.6050) loss 5.2702 (6.7845) grad_norm 1.8918 (inf) loss_scale 128.0000 (161.9716) mem 22339MB +[2024-07-29 05:23:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][220/625] eta 0:04:05 lr 0.000051 wd 0.0500 time 0.5998 (0.6064) data time 0.0010 (0.0029) model time 0.5987 (0.6046) loss 6.8040 (6.7794) grad_norm 2.2323 (inf) loss_scale 128.0000 (160.4344) mem 22339MB +[2024-07-29 05:23:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][230/625] eta 0:03:59 lr 0.000051 wd 0.0500 time 0.6035 (0.6062) data time 0.0010 (0.0028) model time 0.6025 (0.6044) loss 7.4950 (6.7865) grad_norm 2.2805 (inf) loss_scale 128.0000 (159.0303) mem 22339MB +[2024-07-29 05:23:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][240/625] eta 0:03:53 lr 0.000051 wd 0.0500 time 0.6042 (0.6062) data time 0.0010 (0.0028) model time 0.6032 (0.6044) loss 6.7045 (6.7885) grad_norm 1.9261 (inf) loss_scale 128.0000 (157.7427) mem 22339MB +[2024-07-29 05:23:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][250/625] eta 0:03:47 lr 0.000051 wd 0.0500 time 0.6049 (0.6061) data time 0.0008 (0.0027) model time 0.6041 (0.6044) loss 6.9437 (6.8017) grad_norm 3.2608 (inf) loss_scale 128.0000 (156.5578) mem 22339MB +[2024-07-29 05:23:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][260/625] eta 0:03:41 lr 0.000051 wd 0.0500 time 0.6052 (0.6061) data time 0.0010 (0.0026) model time 0.6043 (0.6043) loss 7.7007 (6.8170) grad_norm 2.2013 (inf) loss_scale 128.0000 (155.4636) mem 22339MB +[2024-07-29 05:23:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][270/625] eta 0:03:35 lr 0.000051 wd 0.0500 time 0.6025 (0.6059) data time 0.0008 (0.0026) model time 0.6017 (0.6042) loss 8.4875 (6.8390) grad_norm 2.7012 (inf) loss_scale 128.0000 (154.4502) mem 22339MB +[2024-07-29 05:23:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][280/625] eta 0:03:29 lr 0.000051 wd 0.0500 time 0.6034 (0.6065) data time 0.0008 (0.0025) model time 0.6026 (0.6049) loss 6.9124 (6.8384) grad_norm 2.3094 (inf) loss_scale 128.0000 (153.5089) mem 22339MB +[2024-07-29 05:23:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][290/625] eta 0:03:23 lr 0.000051 wd 0.0500 time 0.5963 (0.6063) data time 0.0008 (0.0025) model time 0.5955 (0.6047) loss 6.2077 (6.8372) grad_norm 2.6613 (inf) loss_scale 128.0000 (152.6323) mem 22339MB +[2024-07-29 05:24:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][300/625] eta 0:03:16 lr 0.000051 wd 0.0500 time 0.5925 (0.6061) data time 0.0008 (0.0024) model time 0.5917 (0.6045) loss 8.1517 (6.8419) grad_norm 2.3570 (inf) loss_scale 128.0000 (151.8140) mem 22339MB +[2024-07-29 05:24:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][310/625] eta 0:03:10 lr 0.000051 wd 0.0500 time 0.5984 (0.6061) data time 0.0011 (0.0024) model time 0.5974 (0.6045) loss 8.1976 (6.8496) grad_norm 2.6918 (inf) loss_scale 128.0000 (151.0482) mem 22339MB +[2024-07-29 05:24:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][320/625] eta 0:03:04 lr 0.000051 wd 0.0500 time 0.5902 (0.6060) data time 0.0008 (0.0023) model time 0.5894 (0.6044) loss 7.5011 (6.8513) grad_norm 2.2106 (inf) loss_scale 128.0000 (150.3302) mem 22339MB +[2024-07-29 05:24:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][330/625] eta 0:02:58 lr 0.000051 wd 0.0500 time 0.6020 (0.6060) data time 0.0010 (0.0023) model time 0.6010 (0.6044) loss 7.3078 (6.8471) grad_norm 2.7911 (inf) loss_scale 128.0000 (149.6556) mem 22339MB +[2024-07-29 05:24:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][340/625] eta 0:02:52 lr 0.000051 wd 0.0500 time 0.5993 (0.6060) data time 0.0010 (0.0023) model time 0.5983 (0.6044) loss 8.0056 (6.8470) grad_norm 2.0466 (inf) loss_scale 128.0000 (149.0205) mem 22339MB +[2024-07-29 05:24:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][350/625] eta 0:02:46 lr 0.000051 wd 0.0500 time 0.6038 (0.6059) data time 0.0008 (0.0022) model time 0.6030 (0.6043) loss 5.5178 (6.8436) grad_norm 2.5443 (inf) loss_scale 128.0000 (148.4217) mem 22339MB +[2024-07-29 05:24:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][360/625] eta 0:02:40 lr 0.000051 wd 0.0500 time 0.6020 (0.6059) data time 0.0008 (0.0022) model time 0.6011 (0.6043) loss 6.4013 (6.8530) grad_norm 2.9275 (inf) loss_scale 128.0000 (147.8560) mem 22339MB +[2024-07-29 05:24:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][370/625] eta 0:02:34 lr 0.000051 wd 0.0500 time 0.5968 (0.6059) data time 0.0007 (0.0022) model time 0.5961 (0.6044) loss 6.2567 (6.8578) grad_norm 2.3607 (inf) loss_scale 128.0000 (147.3208) mem 22339MB +[2024-07-29 05:24:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][380/625] eta 0:02:28 lr 0.000051 wd 0.0500 time 0.5974 (0.6059) data time 0.0008 (0.0021) model time 0.5965 (0.6043) loss 5.3290 (6.8427) grad_norm 3.1948 (inf) loss_scale 128.0000 (146.8136) mem 22339MB +[2024-07-29 05:24:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][390/625] eta 0:02:22 lr 0.000051 wd 0.0500 time 0.5911 (0.6058) data time 0.0011 (0.0021) model time 0.5900 (0.6043) loss 6.5538 (6.8395) grad_norm 2.4870 (inf) loss_scale 128.0000 (146.3325) mem 22339MB +[2024-07-29 05:25:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][400/625] eta 0:02:16 lr 0.000051 wd 0.0500 time 0.6024 (0.6058) data time 0.0011 (0.0021) model time 0.6013 (0.6043) loss 6.6746 (6.8328) grad_norm 2.1130 (inf) loss_scale 128.0000 (145.8753) mem 22339MB +[2024-07-29 05:25:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][410/625] eta 0:02:10 lr 0.000051 wd 0.0500 time 0.6027 (0.6058) data time 0.0009 (0.0020) model time 0.6017 (0.6043) loss 5.8365 (6.8413) grad_norm 2.6931 (inf) loss_scale 128.0000 (145.4404) mem 22339MB +[2024-07-29 05:25:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][420/625] eta 0:02:04 lr 0.000051 wd 0.0500 time 0.5985 (0.6057) data time 0.0010 (0.0020) model time 0.5975 (0.6042) loss 6.2154 (6.8347) grad_norm 2.0122 (inf) loss_scale 128.0000 (145.0261) mem 22339MB +[2024-07-29 05:25:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][430/625] eta 0:01:58 lr 0.000051 wd 0.0500 time 0.6018 (0.6056) data time 0.0010 (0.0020) model time 0.6008 (0.6041) loss 6.7229 (6.8338) grad_norm 2.1853 (inf) loss_scale 128.0000 (144.6311) mem 22339MB +[2024-07-29 05:25:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][440/625] eta 0:01:52 lr 0.000051 wd 0.0500 time 0.5955 (0.6055) data time 0.0008 (0.0020) model time 0.5947 (0.6040) loss 6.8766 (6.8372) grad_norm 2.6863 (inf) loss_scale 128.0000 (144.2540) mem 22339MB +[2024-07-29 05:25:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][450/625] eta 0:01:45 lr 0.000051 wd 0.0500 time 0.6034 (0.6055) data time 0.0011 (0.0020) model time 0.6024 (0.6040) loss 7.3118 (6.8353) grad_norm 1.9528 (inf) loss_scale 128.0000 (143.8936) mem 22339MB +[2024-07-29 05:25:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][460/625] eta 0:01:39 lr 0.000050 wd 0.0500 time 0.6024 (0.6055) data time 0.0010 (0.0019) model time 0.6014 (0.6040) loss 5.7929 (6.8294) grad_norm 2.3003 (inf) loss_scale 128.0000 (143.5488) mem 22339MB +[2024-07-29 05:25:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][470/625] eta 0:01:33 lr 0.000050 wd 0.0500 time 0.6020 (0.6055) data time 0.0007 (0.0019) model time 0.6013 (0.6040) loss 6.3228 (6.8246) grad_norm 2.3443 (inf) loss_scale 128.0000 (143.2187) mem 22339MB +[2024-07-29 05:25:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][480/625] eta 0:01:27 lr 0.000050 wd 0.0500 time 0.6042 (0.6055) data time 0.0010 (0.0019) model time 0.6032 (0.6040) loss 5.8939 (6.8190) grad_norm 3.5544 (inf) loss_scale 128.0000 (142.9023) mem 22339MB +[2024-07-29 05:26:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][490/625] eta 0:01:21 lr 0.000050 wd 0.0500 time 0.6034 (0.6054) data time 0.0010 (0.0019) model time 0.6024 (0.6040) loss 6.9155 (6.8207) grad_norm 2.3945 (inf) loss_scale 128.0000 (142.5988) mem 22339MB +[2024-07-29 05:26:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][500/625] eta 0:01:15 lr 0.000050 wd 0.0500 time 0.6048 (0.6060) data time 0.0010 (0.0019) model time 0.6038 (0.6047) loss 7.5031 (6.8245) grad_norm 1.8202 (inf) loss_scale 128.0000 (142.3074) mem 22339MB +[2024-07-29 05:26:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][510/625] eta 0:01:09 lr 0.000050 wd 0.0500 time 0.5982 (0.6059) data time 0.0008 (0.0019) model time 0.5974 (0.6046) loss 6.5637 (6.8222) grad_norm 2.0731 (inf) loss_scale 128.0000 (142.0274) mem 22339MB +[2024-07-29 05:26:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][520/625] eta 0:01:03 lr 0.000050 wd 0.0500 time 0.6032 (0.6059) data time 0.0008 (0.0018) model time 0.6023 (0.6045) loss 6.9763 (6.8228) grad_norm 3.7931 (inf) loss_scale 128.0000 (141.7582) mem 22339MB +[2024-07-29 05:26:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][530/625] eta 0:00:57 lr 0.000050 wd 0.0500 time 0.6047 (0.6059) data time 0.0010 (0.0018) model time 0.6038 (0.6045) loss 8.2343 (6.8251) grad_norm 3.2006 (inf) loss_scale 128.0000 (141.4991) mem 22339MB +[2024-07-29 05:26:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][540/625] eta 0:00:51 lr 0.000050 wd 0.0500 time 0.6016 (0.6058) data time 0.0008 (0.0018) model time 0.6008 (0.6045) loss 5.6905 (6.8182) grad_norm 6.0655 (inf) loss_scale 128.0000 (141.2495) mem 22339MB +[2024-07-29 05:26:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][550/625] eta 0:00:45 lr 0.000050 wd 0.0500 time 0.6033 (0.6058) data time 0.0010 (0.0018) model time 0.6023 (0.6044) loss 6.1008 (6.8083) grad_norm 3.0848 (inf) loss_scale 128.0000 (141.0091) mem 22339MB +[2024-07-29 05:26:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][560/625] eta 0:00:39 lr 0.000050 wd 0.0500 time 0.5970 (0.6057) data time 0.0008 (0.0018) model time 0.5962 (0.6044) loss 6.3636 (6.8016) grad_norm 59.9814 (inf) loss_scale 128.0000 (140.7772) mem 22339MB +[2024-07-29 05:26:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][570/625] eta 0:00:33 lr 0.000050 wd 0.0500 time 0.6020 (0.6057) data time 0.0008 (0.0018) model time 0.6013 (0.6043) loss 7.0159 (6.7964) grad_norm 1.9197 (inf) loss_scale 128.0000 (140.5534) mem 22339MB +[2024-07-29 05:26:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][580/625] eta 0:00:27 lr 0.000050 wd 0.0500 time 0.6008 (0.6056) data time 0.0008 (0.0018) model time 0.6000 (0.6043) loss 5.8274 (6.7954) grad_norm 2.2822 (inf) loss_scale 128.0000 (140.3373) mem 22339MB +[2024-07-29 05:27:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][590/625] eta 0:00:21 lr 0.000050 wd 0.0500 time 0.6022 (0.6057) data time 0.0010 (0.0017) model time 0.6012 (0.6043) loss 6.1410 (6.7961) grad_norm 2.3732 (inf) loss_scale 128.0000 (140.1286) mem 22339MB +[2024-07-29 05:27:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][600/625] eta 0:00:15 lr 0.000050 wd 0.0500 time 0.5997 (0.6056) data time 0.0010 (0.0017) model time 0.5986 (0.6043) loss 6.4145 (6.8060) grad_norm 2.2071 (inf) loss_scale 128.0000 (139.9268) mem 22339MB +[2024-07-29 05:27:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][610/625] eta 0:00:09 lr 0.000050 wd 0.0500 time 0.6046 (0.6056) data time 0.0008 (0.0017) model time 0.6039 (0.6043) loss 7.6342 (6.8063) grad_norm 2.5932 (inf) loss_scale 128.0000 (139.7316) mem 22339MB +[2024-07-29 05:27:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [267/300][620/625] eta 0:00:03 lr 0.000050 wd 0.0500 time 0.6041 (0.6056) data time 0.0007 (0.0017) model time 0.6034 (0.6043) loss 7.5886 (6.8039) grad_norm 2.4261 (inf) loss_scale 128.0000 (139.5427) mem 22339MB +[2024-07-29 05:27:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 267 training takes 0:06:18 +[2024-07-29 05:27:21 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 05:27:26 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 05:27:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.517 (0.517) Loss 0.4912 (0.4912) Acc@1 90.186 (90.186) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-29 05:27:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.162) Loss 0.7314 (0.5921) Acc@1 82.764 (88.197) Acc@5 97.168 (98.136) Mem 22339MB +[2024-07-29 05:27:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.145) Loss 0.8120 (0.6770) Acc@1 81.055 (85.614) Acc@5 96.143 (97.380) Mem 22339MB +[2024-07-29 05:27:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.273 Acc@5 97.377 +[2024-07-29 05:27:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 05:27:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 85.27% +[2024-07-29 05:27:29 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-29 05:27:31 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-29 05:27:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.530 (0.530) Loss 0.4973 (0.4973) Acc@1 90.430 (90.430) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-29 05:27:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.163) Loss 0.7363 (0.6022) Acc@1 83.154 (88.197) Acc@5 97.070 (98.171) Mem 22339MB +[2024-07-29 05:27:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.127 (0.146) Loss 0.8242 (0.6863) Acc@1 80.859 (85.568) Acc@5 96.289 (97.405) Mem 22339MB +[2024-07-29 05:27:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.177 Acc@5 97.413 +[2024-07-29 05:27:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.2% +[2024-07-29 05:27:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.18% +[2024-07-29 05:27:34 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 05:27:36 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 05:27:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][0/625] eta 0:09:49 lr 0.000050 wd 0.0500 time 0.9425 (0.9425) data time 0.4036 (0.4036) model time 0.0000 (0.0000) loss 6.6784 (6.6784) grad_norm 2.2876 (2.2876) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:27:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][10/625] eta 0:06:28 lr 0.000050 wd 0.0500 time 0.6023 (0.6324) data time 0.0010 (0.0376) model time 0.0000 (0.0000) loss 7.8826 (6.8209) grad_norm 1.8787 (2.4969) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:27:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][20/625] eta 0:06:14 lr 0.000050 wd 0.0500 time 0.5980 (0.6186) data time 0.0008 (0.0202) model time 0.0000 (0.0000) loss 8.0549 (6.7618) grad_norm 7.5143 (2.9424) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:27:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][30/625] eta 0:06:04 lr 0.000050 wd 0.0500 time 0.5982 (0.6126) data time 0.0009 (0.0140) model time 0.0000 (0.0000) loss 6.5794 (6.6976) grad_norm 2.0361 (2.8142) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:28:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][40/625] eta 0:05:57 lr 0.000050 wd 0.0500 time 0.6049 (0.6105) data time 0.0009 (0.0109) model time 0.0000 (0.0000) loss 6.0723 (6.7090) grad_norm 3.4108 (2.9155) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:28:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][50/625] eta 0:05:50 lr 0.000050 wd 0.0500 time 0.5991 (0.6089) data time 0.0011 (0.0090) model time 0.0000 (0.0000) loss 6.1747 (6.7543) grad_norm 5.3367 (2.8871) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:28:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][60/625] eta 0:05:43 lr 0.000050 wd 0.0500 time 0.6023 (0.6084) data time 0.0012 (0.0077) model time 0.6011 (0.6046) loss 6.4953 (6.7140) grad_norm 2.1245 (2.7686) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:28:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][70/625] eta 0:05:37 lr 0.000050 wd 0.0500 time 0.5967 (0.6076) data time 0.0009 (0.0067) model time 0.5958 (0.6032) loss 6.8211 (6.7618) grad_norm 2.3748 (2.7286) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:28:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][80/625] eta 0:05:31 lr 0.000050 wd 0.0500 time 0.7251 (0.6083) data time 0.0012 (0.0060) model time 0.7239 (0.6063) loss 6.6997 (6.7519) grad_norm 1.7423 (2.6875) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:28:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][90/625] eta 0:05:24 lr 0.000050 wd 0.0500 time 0.5994 (0.6070) data time 0.0010 (0.0055) model time 0.5983 (0.6034) loss 6.7401 (6.7249) grad_norm 1.8558 (2.7463) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:28:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][100/625] eta 0:05:20 lr 0.000050 wd 0.0500 time 0.5988 (0.6113) data time 0.0011 (0.0051) model time 0.5977 (0.6127) loss 7.1984 (6.7460) grad_norm 2.3253 (2.7926) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:28:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][110/625] eta 0:05:14 lr 0.000049 wd 0.0500 time 0.6015 (0.6106) data time 0.0009 (0.0047) model time 0.6006 (0.6110) loss 5.9325 (6.7512) grad_norm 7.5582 (2.8406) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:28:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][120/625] eta 0:05:08 lr 0.000049 wd 0.0500 time 0.6056 (0.6101) data time 0.0010 (0.0044) model time 0.6046 (0.6099) loss 6.0122 (6.7314) grad_norm 3.5567 (2.8159) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:28:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][130/625] eta 0:05:01 lr 0.000049 wd 0.0500 time 0.6041 (0.6097) data time 0.0010 (0.0041) model time 0.6031 (0.6091) loss 7.1757 (6.7234) grad_norm 2.9285 (2.8030) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:29:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][140/625] eta 0:04:55 lr 0.000049 wd 0.0500 time 0.5875 (0.6092) data time 0.0011 (0.0039) model time 0.5864 (0.6083) loss 7.4911 (6.7156) grad_norm 2.4376 (2.7750) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:29:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][150/625] eta 0:04:49 lr 0.000049 wd 0.0500 time 0.5992 (0.6087) data time 0.0011 (0.0037) model time 0.5982 (0.6075) loss 6.5747 (6.7226) grad_norm 1.9947 (2.7616) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:29:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][160/625] eta 0:04:42 lr 0.000049 wd 0.0500 time 0.5995 (0.6084) data time 0.0008 (0.0036) model time 0.5986 (0.6071) loss 6.7465 (6.7562) grad_norm 3.8403 (2.7633) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:29:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][170/625] eta 0:04:36 lr 0.000049 wd 0.0500 time 0.5913 (0.6080) data time 0.0012 (0.0034) model time 0.5901 (0.6066) loss 6.4920 (6.7398) grad_norm 1.8832 (2.7388) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:29:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][180/625] eta 0:04:30 lr 0.000049 wd 0.0500 time 0.5991 (0.6077) data time 0.0010 (0.0033) model time 0.5981 (0.6062) loss 7.3314 (6.7453) grad_norm 3.6756 (2.7469) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:29:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][190/625] eta 0:04:24 lr 0.000049 wd 0.0500 time 0.5997 (0.6076) data time 0.0008 (0.0032) model time 0.5989 (0.6061) loss 5.0648 (6.7114) grad_norm 2.5479 (2.7251) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:29:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][200/625] eta 0:04:18 lr 0.000049 wd 0.0500 time 0.6040 (0.6076) data time 0.0011 (0.0031) model time 0.6028 (0.6060) loss 8.4074 (6.7170) grad_norm 2.1287 (2.7163) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:29:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][210/625] eta 0:04:12 lr 0.000049 wd 0.0500 time 0.5879 (0.6074) data time 0.0009 (0.0030) model time 0.5871 (0.6059) loss 6.8519 (6.7235) grad_norm 2.6935 (2.7079) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:29:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][220/625] eta 0:04:05 lr 0.000049 wd 0.0500 time 0.6002 (0.6071) data time 0.0010 (0.0029) model time 0.5991 (0.6055) loss 5.4590 (6.7095) grad_norm 2.0574 (2.6999) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:29:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][230/625] eta 0:03:59 lr 0.000049 wd 0.0500 time 0.6029 (0.6069) data time 0.0010 (0.0028) model time 0.6019 (0.6053) loss 7.8103 (6.7045) grad_norm 3.7793 (2.6971) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:30:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][240/625] eta 0:03:53 lr 0.000049 wd 0.0500 time 0.5870 (0.6070) data time 0.0008 (0.0027) model time 0.5862 (0.6054) loss 6.0283 (6.6938) grad_norm 7.4148 (2.7033) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:30:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][250/625] eta 0:03:47 lr 0.000049 wd 0.0500 time 0.5947 (0.6069) data time 0.0009 (0.0027) model time 0.5938 (0.6054) loss 7.7855 (6.7204) grad_norm 2.5458 (2.7003) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:30:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][260/625] eta 0:03:41 lr 0.000049 wd 0.0500 time 0.5979 (0.6067) data time 0.0008 (0.0026) model time 0.5971 (0.6051) loss 6.9939 (6.7013) grad_norm 2.4717 (2.7078) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:30:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][270/625] eta 0:03:35 lr 0.000049 wd 0.0500 time 0.6004 (0.6073) data time 0.0007 (0.0025) model time 0.5996 (0.6059) loss 5.4900 (6.7058) grad_norm 1.9015 (2.7046) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:30:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][280/625] eta 0:03:29 lr 0.000049 wd 0.0500 time 0.5877 (0.6077) data time 0.0011 (0.0025) model time 0.5866 (0.6064) loss 6.6626 (6.7209) grad_norm 2.5675 (2.7104) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:30:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][290/625] eta 0:03:23 lr 0.000049 wd 0.0500 time 0.5956 (0.6076) data time 0.0010 (0.0024) model time 0.5946 (0.6063) loss 7.2209 (6.7242) grad_norm 2.0855 (2.6983) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:30:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][300/625] eta 0:03:17 lr 0.000049 wd 0.0500 time 0.5992 (0.6076) data time 0.0012 (0.0024) model time 0.5980 (0.6063) loss 7.0184 (6.7354) grad_norm 2.6474 (2.6943) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:30:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][310/625] eta 0:03:11 lr 0.000049 wd 0.0500 time 0.5852 (0.6084) data time 0.0010 (0.0024) model time 0.5842 (0.6072) loss 5.9503 (6.7379) grad_norm 2.5236 (2.6924) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:30:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][320/625] eta 0:03:06 lr 0.000049 wd 0.0500 time 0.5976 (0.6101) data time 0.0008 (0.0023) model time 0.5968 (0.6092) loss 6.7898 (6.7390) grad_norm 1.8141 (2.7003) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:30:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][330/625] eta 0:02:59 lr 0.000049 wd 0.0500 time 0.5899 (0.6100) data time 0.0010 (0.0023) model time 0.5888 (0.6091) loss 6.1904 (6.7380) grad_norm 3.4486 (2.7151) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:31:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][340/625] eta 0:02:53 lr 0.000049 wd 0.0500 time 0.6007 (0.6100) data time 0.0010 (0.0023) model time 0.5997 (0.6090) loss 7.6530 (6.7354) grad_norm 2.2712 (2.7160) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:31:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][350/625] eta 0:02:47 lr 0.000049 wd 0.0500 time 0.5853 (0.6099) data time 0.0012 (0.0023) model time 0.5841 (0.6089) loss 6.5877 (6.7421) grad_norm 2.3494 (2.7092) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:31:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][360/625] eta 0:02:41 lr 0.000049 wd 0.0500 time 0.5959 (0.6098) data time 0.0010 (0.0022) model time 0.5949 (0.6088) loss 8.1429 (6.7521) grad_norm 2.6582 (2.7132) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:31:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][370/625] eta 0:02:35 lr 0.000049 wd 0.0500 time 0.5834 (0.6102) data time 0.0008 (0.0022) model time 0.5826 (0.6092) loss 5.8442 (6.7517) grad_norm 2.0739 (2.7073) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:31:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][380/625] eta 0:02:29 lr 0.000048 wd 0.0500 time 0.5884 (0.6104) data time 0.0009 (0.0022) model time 0.5875 (0.6095) loss 6.2058 (6.7515) grad_norm 2.3920 (2.7084) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:31:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][390/625] eta 0:02:23 lr 0.000048 wd 0.0500 time 0.6016 (0.6104) data time 0.0009 (0.0023) model time 0.6007 (0.6094) loss 6.5653 (6.7586) grad_norm 3.1973 (2.7251) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:31:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][400/625] eta 0:02:17 lr 0.000048 wd 0.0500 time 0.6010 (0.6110) data time 0.0012 (0.0022) model time 0.5998 (0.6100) loss 6.4064 (6.7592) grad_norm 2.9811 (2.7302) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:31:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][410/625] eta 0:02:11 lr 0.000048 wd 0.0500 time 0.6021 (0.6110) data time 0.0011 (0.0022) model time 0.6010 (0.6100) loss 6.5552 (6.7690) grad_norm 2.4892 (2.7521) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:31:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][420/625] eta 0:02:05 lr 0.000048 wd 0.0500 time 0.6030 (0.6108) data time 0.0010 (0.0022) model time 0.6020 (0.6099) loss 7.5901 (6.7764) grad_norm 2.1650 (2.7466) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:32:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][430/625] eta 0:01:59 lr 0.000048 wd 0.0500 time 0.6037 (0.6110) data time 0.0008 (0.0021) model time 0.6029 (0.6101) loss 7.0152 (6.7783) grad_norm 2.2329 (2.7433) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:32:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][440/625] eta 0:01:53 lr 0.000048 wd 0.0500 time 0.5877 (0.6110) data time 0.0011 (0.0021) model time 0.5865 (0.6101) loss 5.5598 (6.7769) grad_norm 1.7506 (2.7316) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:32:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][450/625] eta 0:01:46 lr 0.000048 wd 0.0500 time 0.5871 (0.6108) data time 0.0010 (0.0021) model time 0.5861 (0.6098) loss 7.6656 (6.7764) grad_norm 1.9630 (2.7296) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:32:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][460/625] eta 0:01:40 lr 0.000048 wd 0.0500 time 0.5919 (0.6109) data time 0.0010 (0.0021) model time 0.5908 (0.6100) loss 7.9748 (6.7798) grad_norm 1.9644 (2.7328) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:32:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][470/625] eta 0:01:34 lr 0.000048 wd 0.0500 time 0.5993 (0.6109) data time 0.0011 (0.0021) model time 0.5982 (0.6100) loss 7.1174 (6.7810) grad_norm 2.1107 (2.7269) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:32:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][480/625] eta 0:01:28 lr 0.000048 wd 0.0500 time 0.5913 (0.6108) data time 0.0011 (0.0020) model time 0.5901 (0.6098) loss 6.1235 (6.7782) grad_norm 2.0932 (2.7187) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:32:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][490/625] eta 0:01:22 lr 0.000048 wd 0.0500 time 0.6018 (0.6107) data time 0.0011 (0.0020) model time 0.6006 (0.6097) loss 6.3331 (6.7756) grad_norm 2.1362 (2.7206) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:32:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][500/625] eta 0:01:16 lr 0.000048 wd 0.0500 time 0.5990 (0.6106) data time 0.0007 (0.0020) model time 0.5983 (0.6096) loss 6.9284 (6.7701) grad_norm 2.6237 (2.7124) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:32:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][510/625] eta 0:01:10 lr 0.000048 wd 0.0500 time 0.5938 (0.6105) data time 0.0008 (0.0020) model time 0.5930 (0.6094) loss 5.7085 (6.7624) grad_norm 3.4490 (2.7180) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:32:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][520/625] eta 0:01:04 lr 0.000048 wd 0.0500 time 0.5937 (0.6103) data time 0.0009 (0.0020) model time 0.5928 (0.6093) loss 4.9531 (6.7620) grad_norm 1.8764 (2.7104) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:33:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][530/625] eta 0:00:58 lr 0.000048 wd 0.0500 time 0.7568 (0.6106) data time 0.0008 (0.0020) model time 0.7560 (0.6096) loss 5.9542 (6.7597) grad_norm 2.8237 (2.7031) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:33:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][540/625] eta 0:00:51 lr 0.000048 wd 0.0500 time 0.5955 (0.6110) data time 0.0008 (0.0019) model time 0.5947 (0.6100) loss 6.5930 (6.7590) grad_norm 2.0549 (2.6964) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:33:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][550/625] eta 0:00:45 lr 0.000048 wd 0.0500 time 0.5985 (0.6108) data time 0.0011 (0.0019) model time 0.5973 (0.6098) loss 6.7833 (6.7587) grad_norm 1.7929 (2.7084) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:33:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][560/625] eta 0:00:39 lr 0.000048 wd 0.0500 time 0.6029 (0.6107) data time 0.0011 (0.0019) model time 0.6018 (0.6097) loss 5.9503 (6.7559) grad_norm 3.4079 (2.7223) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:33:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][570/625] eta 0:00:33 lr 0.000048 wd 0.0500 time 0.6011 (0.6106) data time 0.0011 (0.0019) model time 0.6000 (0.6095) loss 6.7735 (6.7561) grad_norm 2.1612 (2.7243) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:33:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][580/625] eta 0:00:27 lr 0.000048 wd 0.0500 time 0.5985 (0.6104) data time 0.0012 (0.0019) model time 0.5973 (0.6094) loss 7.1186 (6.7527) grad_norm 1.9801 (2.7191) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:33:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][590/625] eta 0:00:21 lr 0.000048 wd 0.0500 time 0.6008 (0.6103) data time 0.0009 (0.0019) model time 0.5999 (0.6092) loss 7.3215 (6.7542) grad_norm 1.7333 (2.7111) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:33:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][600/625] eta 0:00:15 lr 0.000048 wd 0.0500 time 0.5987 (0.6102) data time 0.0008 (0.0019) model time 0.5978 (0.6091) loss 5.8998 (6.7581) grad_norm 1.9967 (2.7032) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:33:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][610/625] eta 0:00:09 lr 0.000048 wd 0.0500 time 0.5945 (0.6101) data time 0.0008 (0.0018) model time 0.5938 (0.6090) loss 5.8949 (6.7574) grad_norm 4.1040 (2.6973) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:33:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [268/300][620/625] eta 0:00:03 lr 0.000048 wd 0.0500 time 0.6030 (0.6099) data time 0.0008 (0.0018) model time 0.6022 (0.6088) loss 7.5331 (6.7584) grad_norm 3.0905 (2.6953) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:33:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 268 training takes 0:06:21 +[2024-07-29 05:33:57 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 05:33:59 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 05:34:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.535 (0.535) Loss 0.4902 (0.4902) Acc@1 90.527 (90.527) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-29 05:34:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.164) Loss 0.7319 (0.5927) Acc@1 83.447 (88.175) Acc@5 97.168 (98.167) Mem 22339MB +[2024-07-29 05:34:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.127 (0.146) Loss 0.8105 (0.6781) Acc@1 80.957 (85.589) Acc@5 96.191 (97.396) Mem 22339MB +[2024-07-29 05:34:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.221 Acc@5 97.399 +[2024-07-29 05:34:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.2% +[2024-07-29 05:34:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.856 (0.856) Loss 0.4968 (0.4968) Acc@1 90.381 (90.381) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-29 05:34:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.193) Loss 0.7354 (0.6017) Acc@1 83.154 (88.224) Acc@5 97.070 (98.171) Mem 22339MB +[2024-07-29 05:34:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.162) Loss 0.8232 (0.6859) Acc@1 80.859 (85.572) Acc@5 96.338 (97.407) Mem 22339MB +[2024-07-29 05:34:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.179 Acc@5 97.413 +[2024-07-29 05:34:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.2% +[2024-07-29 05:34:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.18% +[2024-07-29 05:34:06 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 05:34:08 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 05:34:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][0/625] eta 0:10:37 lr 0.000048 wd 0.0500 time 1.0208 (1.0208) data time 0.4825 (0.4825) model time 0.0000 (0.0000) loss 7.8062 (7.8062) grad_norm 2.4204 (2.4204) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:34:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][10/625] eta 0:06:34 lr 0.000048 wd 0.0500 time 0.6000 (0.6411) data time 0.0011 (0.0448) model time 0.0000 (0.0000) loss 7.2697 (6.6108) grad_norm 2.2948 (2.6582) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:34:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][20/625] eta 0:06:16 lr 0.000047 wd 0.0500 time 0.5991 (0.6229) data time 0.0011 (0.0240) model time 0.0000 (0.0000) loss 7.8553 (6.7154) grad_norm 2.0679 (2.7585) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:34:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][30/625] eta 0:06:06 lr 0.000047 wd 0.0500 time 0.5976 (0.6160) data time 0.0011 (0.0166) model time 0.0000 (0.0000) loss 6.6208 (6.7728) grad_norm 2.5217 (2.8555) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:34:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][40/625] eta 0:05:58 lr 0.000047 wd 0.0500 time 0.5904 (0.6125) data time 0.0010 (0.0128) model time 0.0000 (0.0000) loss 8.1251 (6.7595) grad_norm 1.8820 (3.0914) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:34:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][50/625] eta 0:05:50 lr 0.000047 wd 0.0500 time 0.5881 (0.6101) data time 0.0010 (0.0105) model time 0.0000 (0.0000) loss 7.7139 (6.8152) grad_norm 2.3108 (2.9315) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:34:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][60/625] eta 0:05:44 lr 0.000047 wd 0.0500 time 0.5930 (0.6100) data time 0.0008 (0.0089) model time 0.5922 (0.6082) loss 7.7830 (6.7969) grad_norm 2.6222 (2.8549) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:34:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][70/625] eta 0:05:38 lr 0.000047 wd 0.0500 time 0.6041 (0.6094) data time 0.0011 (0.0078) model time 0.6030 (0.6065) loss 7.6792 (6.7920) grad_norm 3.2953 (3.0860) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:34:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][80/625] eta 0:05:31 lr 0.000047 wd 0.0500 time 0.6069 (0.6092) data time 0.0010 (0.0070) model time 0.6059 (0.6065) loss 7.2686 (6.7798) grad_norm 2.6114 (3.0596) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:35:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][90/625] eta 0:05:25 lr 0.000047 wd 0.0500 time 0.6023 (0.6085) data time 0.0011 (0.0063) model time 0.6013 (0.6054) loss 7.5924 (6.8031) grad_norm 2.7816 (3.0693) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:35:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][100/625] eta 0:05:19 lr 0.000047 wd 0.0500 time 0.5998 (0.6079) data time 0.0008 (0.0058) model time 0.5990 (0.6046) loss 7.1232 (6.8293) grad_norm 3.2069 (3.0181) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:35:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][110/625] eta 0:05:12 lr 0.000047 wd 0.0500 time 0.6043 (0.6074) data time 0.0008 (0.0054) model time 0.6035 (0.6039) loss 6.1919 (6.8059) grad_norm 3.1655 (3.0192) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:35:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][120/625] eta 0:05:06 lr 0.000047 wd 0.0500 time 0.5968 (0.6066) data time 0.0010 (0.0050) model time 0.5958 (0.6030) loss 6.8929 (6.8194) grad_norm 1.9029 (2.9789) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:35:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][130/625] eta 0:05:01 lr 0.000047 wd 0.0500 time 0.7481 (0.6084) data time 0.0010 (0.0047) model time 0.7471 (0.6063) loss 8.3862 (6.8203) grad_norm 2.2824 (3.0084) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:35:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][140/625] eta 0:04:55 lr 0.000047 wd 0.0500 time 0.5986 (0.6096) data time 0.0012 (0.0045) model time 0.5975 (0.6083) loss 6.1066 (6.8310) grad_norm 1.8808 (2.9673) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:35:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][150/625] eta 0:04:49 lr 0.000047 wd 0.0500 time 0.6043 (0.6092) data time 0.0009 (0.0042) model time 0.6034 (0.6077) loss 5.8791 (6.8099) grad_norm 2.5102 (2.9281) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:35:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][160/625] eta 0:04:43 lr 0.000047 wd 0.0500 time 0.6064 (0.6089) data time 0.0011 (0.0040) model time 0.6054 (0.6073) loss 6.3513 (6.7955) grad_norm 4.9283 (2.8992) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:35:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][170/625] eta 0:04:36 lr 0.000047 wd 0.0500 time 0.5852 (0.6085) data time 0.0008 (0.0039) model time 0.5843 (0.6067) loss 5.9021 (6.8094) grad_norm 2.4843 (2.8635) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:35:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][180/625] eta 0:04:30 lr 0.000047 wd 0.0500 time 0.5991 (0.6082) data time 0.0008 (0.0037) model time 0.5983 (0.6064) loss 6.9838 (6.8310) grad_norm 1.9024 (2.8438) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:36:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][190/625] eta 0:04:24 lr 0.000047 wd 0.0500 time 0.5997 (0.6079) data time 0.0008 (0.0036) model time 0.5989 (0.6060) loss 8.0136 (6.8150) grad_norm 1.9739 (2.8430) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:36:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][200/625] eta 0:04:18 lr 0.000047 wd 0.0500 time 0.6030 (0.6076) data time 0.0011 (0.0034) model time 0.6019 (0.6057) loss 6.6984 (6.8050) grad_norm 2.2707 (2.8383) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:36:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][210/625] eta 0:04:12 lr 0.000047 wd 0.0500 time 0.6077 (0.6075) data time 0.0010 (0.0033) model time 0.6067 (0.6056) loss 7.1154 (6.8170) grad_norm 3.5803 (2.8308) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:36:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][220/625] eta 0:04:05 lr 0.000047 wd 0.0500 time 0.6021 (0.6074) data time 0.0008 (0.0032) model time 0.6013 (0.6055) loss 6.7847 (6.8330) grad_norm 5.7496 (2.8440) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:36:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][230/625] eta 0:03:59 lr 0.000047 wd 0.0500 time 0.6000 (0.6073) data time 0.0011 (0.0031) model time 0.5989 (0.6054) loss 6.5296 (6.8285) grad_norm 2.0075 (2.8906) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:36:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][240/625] eta 0:03:53 lr 0.000047 wd 0.0500 time 0.5895 (0.6071) data time 0.0008 (0.0030) model time 0.5887 (0.6052) loss 7.1191 (6.8249) grad_norm 2.1980 (2.8964) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:36:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][250/625] eta 0:03:47 lr 0.000047 wd 0.0500 time 0.5888 (0.6068) data time 0.0009 (0.0030) model time 0.5879 (0.6049) loss 6.2492 (6.8236) grad_norm 2.3225 (2.8887) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:36:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][260/625] eta 0:03:41 lr 0.000047 wd 0.0500 time 0.5976 (0.6066) data time 0.0011 (0.0029) model time 0.5965 (0.6047) loss 6.4002 (6.8026) grad_norm 2.7802 (2.8711) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:36:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][270/625] eta 0:03:35 lr 0.000047 wd 0.0500 time 0.6039 (0.6064) data time 0.0011 (0.0028) model time 0.6028 (0.6045) loss 7.9220 (6.8092) grad_norm 2.4278 (2.8765) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:36:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][280/625] eta 0:03:29 lr 0.000047 wd 0.0500 time 0.6040 (0.6068) data time 0.0008 (0.0028) model time 0.6033 (0.6050) loss 7.0866 (6.8040) grad_norm 1.9745 (2.8524) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:37:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][290/625] eta 0:03:23 lr 0.000047 wd 0.0500 time 0.5920 (0.6067) data time 0.0010 (0.0027) model time 0.5910 (0.6049) loss 6.0695 (6.8050) grad_norm 2.8016 (2.8456) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:37:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][300/625] eta 0:03:17 lr 0.000046 wd 0.0500 time 0.6011 (0.6067) data time 0.0009 (0.0027) model time 0.6003 (0.6049) loss 6.3083 (6.7950) grad_norm 3.8116 (2.8366) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:37:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][310/625] eta 0:03:11 lr 0.000046 wd 0.0500 time 0.5894 (0.6066) data time 0.0009 (0.0026) model time 0.5885 (0.6048) loss 7.1964 (6.8184) grad_norm 2.1140 (2.8103) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:37:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][320/625] eta 0:03:04 lr 0.000046 wd 0.0500 time 0.5984 (0.6064) data time 0.0010 (0.0026) model time 0.5974 (0.6047) loss 6.4068 (6.8175) grad_norm 2.9245 (2.7964) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:37:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][330/625] eta 0:02:58 lr 0.000046 wd 0.0500 time 0.6014 (0.6063) data time 0.0011 (0.0025) model time 0.6003 (0.6046) loss 7.3090 (6.8251) grad_norm 2.2747 (2.7934) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:37:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][340/625] eta 0:02:52 lr 0.000046 wd 0.0500 time 0.5932 (0.6062) data time 0.0008 (0.0025) model time 0.5923 (0.6045) loss 5.7832 (6.8312) grad_norm 3.8861 (2.7834) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:37:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][350/625] eta 0:02:46 lr 0.000046 wd 0.0500 time 0.6682 (0.6070) data time 0.0011 (0.0024) model time 0.6671 (0.6054) loss 7.1185 (6.8347) grad_norm 2.5084 (2.7825) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:37:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][360/625] eta 0:02:40 lr 0.000046 wd 0.0500 time 0.5985 (0.6074) data time 0.0008 (0.0024) model time 0.5977 (0.6059) loss 7.6802 (6.8434) grad_norm 3.5875 (2.7787) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:37:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][370/625] eta 0:02:34 lr 0.000046 wd 0.0500 time 0.6005 (0.6074) data time 0.0010 (0.0024) model time 0.5995 (0.6059) loss 7.0146 (6.8415) grad_norm 3.0461 (2.7781) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:38:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][380/625] eta 0:02:28 lr 0.000046 wd 0.0500 time 0.5847 (0.6074) data time 0.0009 (0.0023) model time 0.5838 (0.6059) loss 7.2888 (6.8410) grad_norm 2.5758 (2.7744) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:38:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][390/625] eta 0:02:22 lr 0.000046 wd 0.0500 time 0.6008 (0.6074) data time 0.0010 (0.0023) model time 0.5998 (0.6059) loss 7.3125 (6.8292) grad_norm 3.6599 (2.7637) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:38:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][400/625] eta 0:02:16 lr 0.000046 wd 0.0500 time 0.5920 (0.6073) data time 0.0010 (0.0023) model time 0.5910 (0.6058) loss 5.8652 (6.8248) grad_norm 2.7513 (2.7586) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:38:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][410/625] eta 0:02:10 lr 0.000046 wd 0.0500 time 0.5997 (0.6072) data time 0.0010 (0.0022) model time 0.5988 (0.6057) loss 6.7432 (6.8186) grad_norm 2.7689 (2.7645) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:38:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][420/625] eta 0:02:04 lr 0.000046 wd 0.0500 time 0.6023 (0.6071) data time 0.0008 (0.0022) model time 0.6015 (0.6056) loss 6.9298 (6.8230) grad_norm 1.8478 (2.7602) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:38:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][430/625] eta 0:01:58 lr 0.000046 wd 0.0500 time 0.5970 (0.6072) data time 0.0008 (0.0022) model time 0.5962 (0.6057) loss 7.3711 (6.8147) grad_norm 2.8665 (2.7739) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:38:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][440/625] eta 0:01:52 lr 0.000046 wd 0.0500 time 0.6008 (0.6077) data time 0.0010 (0.0022) model time 0.5998 (0.6063) loss 5.6415 (6.8059) grad_norm 2.1385 (2.7638) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:38:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][450/625] eta 0:01:46 lr 0.000046 wd 0.0500 time 0.5977 (0.6077) data time 0.0011 (0.0022) model time 0.5966 (0.6063) loss 8.1248 (6.8090) grad_norm 2.6675 (2.7552) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:38:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][460/625] eta 0:01:40 lr 0.000046 wd 0.0500 time 0.5958 (0.6083) data time 0.0011 (0.0022) model time 0.5946 (0.6069) loss 8.2289 (6.8063) grad_norm 2.3187 (2.7761) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:38:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][470/625] eta 0:01:34 lr 0.000046 wd 0.0500 time 0.5866 (0.6086) data time 0.0010 (0.0021) model time 0.5856 (0.6073) loss 7.1477 (6.8065) grad_norm 3.5869 (2.7783) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:39:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][480/625] eta 0:01:28 lr 0.000046 wd 0.0500 time 0.6036 (0.6085) data time 0.0008 (0.0021) model time 0.6028 (0.6072) loss 6.4586 (6.7986) grad_norm 3.5987 (2.8262) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:39:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][490/625] eta 0:01:22 lr 0.000046 wd 0.0500 time 0.6034 (0.6087) data time 0.0010 (0.0021) model time 0.6024 (0.6074) loss 5.3757 (6.7960) grad_norm 2.7715 (2.8208) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:39:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][500/625] eta 0:01:16 lr 0.000046 wd 0.0500 time 0.6021 (0.6091) data time 0.0010 (0.0021) model time 0.6011 (0.6079) loss 7.5671 (6.7886) grad_norm 2.6429 (2.8124) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:39:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][510/625] eta 0:01:10 lr 0.000046 wd 0.0500 time 0.6024 (0.6093) data time 0.0008 (0.0021) model time 0.6016 (0.6081) loss 6.2518 (6.7773) grad_norm 2.2946 (2.8059) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:39:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][520/625] eta 0:01:03 lr 0.000046 wd 0.0500 time 0.6049 (0.6092) data time 0.0010 (0.0020) model time 0.6039 (0.6080) loss 6.9145 (6.7758) grad_norm 2.9124 (2.7980) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:39:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][530/625] eta 0:00:57 lr 0.000046 wd 0.0500 time 0.5973 (0.6094) data time 0.0008 (0.0021) model time 0.5965 (0.6081) loss 6.2012 (6.7748) grad_norm 2.8422 (2.8047) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:39:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][540/625] eta 0:00:51 lr 0.000046 wd 0.0500 time 0.6009 (0.6092) data time 0.0011 (0.0021) model time 0.5999 (0.6079) loss 8.6027 (6.7729) grad_norm 1.8455 (2.8357) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:39:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][550/625] eta 0:00:45 lr 0.000046 wd 0.0500 time 0.5967 (0.6091) data time 0.0011 (0.0021) model time 0.5956 (0.6078) loss 5.8224 (6.7700) grad_norm 4.3704 (2.8346) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:39:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][560/625] eta 0:00:39 lr 0.000046 wd 0.0500 time 0.5980 (0.6090) data time 0.0008 (0.0020) model time 0.5972 (0.6076) loss 7.5559 (6.7701) grad_norm 2.4805 (2.8401) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:39:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][570/625] eta 0:00:33 lr 0.000046 wd 0.0500 time 0.7542 (0.6092) data time 0.0009 (0.0020) model time 0.7533 (0.6079) loss 7.4650 (6.7694) grad_norm 1.8445 (2.8285) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:40:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][580/625] eta 0:00:27 lr 0.000045 wd 0.0500 time 0.5925 (0.6094) data time 0.0008 (0.0020) model time 0.5916 (0.6082) loss 7.1163 (6.7677) grad_norm 2.4240 (2.8967) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:40:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][590/625] eta 0:00:21 lr 0.000045 wd 0.0500 time 0.6069 (0.6093) data time 0.0008 (0.0020) model time 0.6061 (0.6081) loss 6.0153 (6.7758) grad_norm 2.5966 (2.8986) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:40:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][600/625] eta 0:00:15 lr 0.000045 wd 0.0500 time 0.5973 (0.6093) data time 0.0008 (0.0020) model time 0.5964 (0.6080) loss 7.7439 (6.7727) grad_norm 2.5534 (2.9044) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:40:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][610/625] eta 0:00:09 lr 0.000045 wd 0.0500 time 0.5993 (0.6091) data time 0.0008 (0.0020) model time 0.5985 (0.6078) loss 6.2583 (6.7695) grad_norm 3.8382 (2.9145) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:40:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [269/300][620/625] eta 0:00:03 lr 0.000045 wd 0.0500 time 0.6077 (0.6090) data time 0.0006 (0.0019) model time 0.6071 (0.6077) loss 6.1074 (6.7744) grad_norm 2.7155 (2.9167) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:40:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 269 training takes 0:06:20 +[2024-07-29 05:40:29 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 05:40:34 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 05:40:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.519 (0.519) Loss 0.4893 (0.4893) Acc@1 90.381 (90.381) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-29 05:40:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.162) Loss 0.7349 (0.5927) Acc@1 83.057 (88.139) Acc@5 97.266 (98.189) Mem 22339MB +[2024-07-29 05:40:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.145) Loss 0.8149 (0.6792) Acc@1 81.104 (85.591) Acc@5 96.191 (97.398) Mem 22339MB +[2024-07-29 05:40:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.239 Acc@5 97.399 +[2024-07-29 05:40:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.2% +[2024-07-29 05:40:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.820 (0.820) Loss 0.4968 (0.4968) Acc@1 90.381 (90.381) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-29 05:40:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.192) Loss 0.7349 (0.6011) Acc@1 83.154 (88.219) Acc@5 97.070 (98.171) Mem 22339MB +[2024-07-29 05:40:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.161) Loss 0.8228 (0.6853) Acc@1 81.006 (85.589) Acc@5 96.387 (97.407) Mem 22339MB +[2024-07-29 05:40:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.199 Acc@5 97.413 +[2024-07-29 05:40:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.2% +[2024-07-29 05:40:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.20% +[2024-07-29 05:40:41 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 05:40:44 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 05:40:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][0/625] eta 0:10:36 lr 0.000045 wd 0.0500 time 1.0186 (1.0186) data time 0.4834 (0.4834) model time 0.0000 (0.0000) loss 6.4726 (6.4726) grad_norm 1.8639 (1.8639) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:40:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][10/625] eta 0:06:33 lr 0.000045 wd 0.0500 time 0.6040 (0.6402) data time 0.0010 (0.0449) model time 0.0000 (0.0000) loss 6.6911 (6.3261) grad_norm 2.8049 (2.7057) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:40:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][20/625] eta 0:06:16 lr 0.000045 wd 0.0500 time 0.6030 (0.6231) data time 0.0010 (0.0240) model time 0.0000 (0.0000) loss 7.0796 (6.7237) grad_norm 2.4757 (2.6945) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:41:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][30/625] eta 0:06:10 lr 0.000045 wd 0.0500 time 0.6043 (0.6221) data time 0.0007 (0.0166) model time 0.0000 (0.0000) loss 5.7318 (6.6134) grad_norm 2.0049 (2.5805) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:41:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][40/625] eta 0:06:00 lr 0.000045 wd 0.0500 time 0.5975 (0.6171) data time 0.0009 (0.0128) model time 0.0000 (0.0000) loss 6.4080 (6.6342) grad_norm 2.0374 (2.5627) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:41:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][50/625] eta 0:05:52 lr 0.000045 wd 0.0500 time 0.5992 (0.6139) data time 0.0010 (0.0105) model time 0.0000 (0.0000) loss 6.9249 (6.5800) grad_norm 6.9575 (2.6623) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:41:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][60/625] eta 0:05:45 lr 0.000045 wd 0.0500 time 0.6027 (0.6120) data time 0.0012 (0.0089) model time 0.6016 (0.6016) loss 7.0045 (6.6115) grad_norm 1.8820 (2.6762) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:41:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][70/625] eta 0:05:38 lr 0.000045 wd 0.0500 time 0.6006 (0.6106) data time 0.0010 (0.0078) model time 0.5995 (0.6011) loss 7.3917 (6.6091) grad_norm 2.3169 (2.7342) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:41:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][80/625] eta 0:05:32 lr 0.000045 wd 0.0500 time 0.6019 (0.6099) data time 0.0009 (0.0070) model time 0.6010 (0.6021) loss 7.2529 (6.6422) grad_norm 2.6130 (2.7431) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:41:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][90/625] eta 0:05:25 lr 0.000045 wd 0.0500 time 0.6035 (0.6093) data time 0.0011 (0.0064) model time 0.6024 (0.6023) loss 6.3387 (6.6106) grad_norm 1.5557 (2.6993) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:41:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][100/625] eta 0:05:19 lr 0.000045 wd 0.0500 time 0.6055 (0.6088) data time 0.0011 (0.0059) model time 0.6044 (0.6025) loss 6.9047 (6.6397) grad_norm 2.3266 (2.6734) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:41:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][110/625] eta 0:05:13 lr 0.000045 wd 0.0500 time 0.6016 (0.6084) data time 0.0011 (0.0054) model time 0.6005 (0.6027) loss 6.6887 (6.6231) grad_norm 3.0173 (2.6641) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:41:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][120/625] eta 0:05:07 lr 0.000045 wd 0.0500 time 0.6012 (0.6079) data time 0.0010 (0.0051) model time 0.6002 (0.6025) loss 6.8324 (6.6623) grad_norm 3.9138 (2.6374) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:42:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][130/625] eta 0:05:00 lr 0.000045 wd 0.0500 time 0.5923 (0.6072) data time 0.0010 (0.0048) model time 0.5913 (0.6018) loss 7.7139 (6.6548) grad_norm 1.6009 (2.6160) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:42:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][140/625] eta 0:04:54 lr 0.000045 wd 0.0500 time 0.5968 (0.6068) data time 0.0011 (0.0045) model time 0.5957 (0.6017) loss 7.2640 (6.6661) grad_norm 4.3316 (2.6139) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:42:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][150/625] eta 0:04:48 lr 0.000045 wd 0.0500 time 0.6032 (0.6065) data time 0.0009 (0.0043) model time 0.6024 (0.6016) loss 6.3502 (6.6664) grad_norm 1.9179 (2.7226) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:42:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][160/625] eta 0:04:41 lr 0.000045 wd 0.0500 time 0.5987 (0.6063) data time 0.0008 (0.0041) model time 0.5979 (0.6018) loss 6.2929 (6.6677) grad_norm 2.6019 (2.7081) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:42:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][170/625] eta 0:04:37 lr 0.000045 wd 0.0500 time 0.8114 (0.6088) data time 0.0010 (0.0039) model time 0.8103 (0.6056) loss 7.7596 (6.6702) grad_norm 2.2678 (2.7021) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:42:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][180/625] eta 0:04:30 lr 0.000045 wd 0.0500 time 0.5979 (0.6086) data time 0.0010 (0.0037) model time 0.5968 (0.6054) loss 6.5655 (6.6780) grad_norm 2.2517 (2.7091) loss_scale 128.0000 (128.0000) mem 22339MB +[2024-07-29 05:42:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][190/625] eta 0:04:24 lr 0.000045 wd 0.0500 time 0.5990 (0.6083) data time 0.0009 (0.0036) model time 0.5981 (0.6052) loss 6.4003 (6.6760) grad_norm 2.8102 (2.6900) loss_scale 256.0000 (134.7016) mem 22339MB +[2024-07-29 05:42:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][200/625] eta 0:04:18 lr 0.000045 wd 0.0500 time 0.6023 (0.6079) data time 0.0010 (0.0035) model time 0.6013 (0.6049) loss 7.8513 (6.6835) grad_norm 3.4660 (2.6758) loss_scale 256.0000 (140.7363) mem 22339MB +[2024-07-29 05:42:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][210/625] eta 0:04:12 lr 0.000045 wd 0.0500 time 0.5994 (0.6077) data time 0.0010 (0.0033) model time 0.5984 (0.6047) loss 6.6796 (6.6819) grad_norm 8.5165 (2.6949) loss_scale 256.0000 (146.1991) mem 22339MB +[2024-07-29 05:42:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][220/625] eta 0:04:05 lr 0.000045 wd 0.0500 time 0.6023 (0.6073) data time 0.0010 (0.0032) model time 0.6012 (0.6043) loss 6.3390 (6.6839) grad_norm 2.6721 (2.7098) loss_scale 256.0000 (151.1674) mem 22339MB +[2024-07-29 05:43:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][230/625] eta 0:03:59 lr 0.000045 wd 0.0500 time 0.6014 (0.6071) data time 0.0010 (0.0031) model time 0.6004 (0.6042) loss 5.5505 (6.6964) grad_norm 1.7092 (2.6937) loss_scale 256.0000 (155.7056) mem 22339MB +[2024-07-29 05:43:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][240/625] eta 0:03:53 lr 0.000044 wd 0.0500 time 0.6018 (0.6071) data time 0.0011 (0.0031) model time 0.6007 (0.6042) loss 6.2274 (6.6932) grad_norm 2.4379 (2.7062) loss_scale 256.0000 (159.8672) mem 22339MB +[2024-07-29 05:43:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][250/625] eta 0:03:47 lr 0.000044 wd 0.0500 time 0.7266 (0.6074) data time 0.0008 (0.0030) model time 0.7258 (0.6047) loss 5.4053 (6.7023) grad_norm 3.0544 (2.6884) loss_scale 256.0000 (163.6972) mem 22339MB +[2024-07-29 05:43:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][260/625] eta 0:03:41 lr 0.000044 wd 0.0500 time 0.5998 (0.6073) data time 0.0011 (0.0029) model time 0.5987 (0.6046) loss 8.2756 (6.7312) grad_norm 2.1839 (2.7033) loss_scale 256.0000 (167.2337) mem 22339MB +[2024-07-29 05:43:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][270/625] eta 0:03:35 lr 0.000044 wd 0.0500 time 0.5969 (0.6071) data time 0.0008 (0.0028) model time 0.5962 (0.6044) loss 7.2633 (6.7366) grad_norm 2.4158 (2.7768) loss_scale 256.0000 (170.5092) mem 22339MB +[2024-07-29 05:43:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][280/625] eta 0:03:29 lr 0.000044 wd 0.0500 time 0.5856 (0.6071) data time 0.0008 (0.0028) model time 0.5847 (0.6045) loss 6.8722 (6.7348) grad_norm 3.5620 (2.7842) loss_scale 256.0000 (173.5516) mem 22339MB +[2024-07-29 05:43:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][290/625] eta 0:03:23 lr 0.000044 wd 0.0500 time 0.5998 (0.6071) data time 0.0008 (0.0027) model time 0.5990 (0.6046) loss 6.3280 (6.7286) grad_norm 2.0369 (2.7832) loss_scale 256.0000 (176.3849) mem 22339MB +[2024-07-29 05:43:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][300/625] eta 0:03:17 lr 0.000044 wd 0.0500 time 0.5899 (0.6071) data time 0.0008 (0.0026) model time 0.5892 (0.6047) loss 6.5351 (6.7276) grad_norm 11.9709 (2.7946) loss_scale 256.0000 (179.0299) mem 22339MB +[2024-07-29 05:43:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][310/625] eta 0:03:11 lr 0.000044 wd 0.0500 time 0.6020 (0.6071) data time 0.0010 (0.0026) model time 0.6010 (0.6047) loss 6.9755 (6.7223) grad_norm 1.9989 (2.7813) loss_scale 256.0000 (181.5048) mem 22339MB +[2024-07-29 05:43:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][320/625] eta 0:03:05 lr 0.000044 wd 0.0500 time 0.6012 (0.6071) data time 0.0011 (0.0025) model time 0.6001 (0.6047) loss 7.6526 (6.7365) grad_norm 2.1531 (2.7977) loss_scale 256.0000 (183.8255) mem 22339MB +[2024-07-29 05:44:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][330/625] eta 0:02:59 lr 0.000044 wd 0.0500 time 0.6031 (0.6069) data time 0.0008 (0.0025) model time 0.6023 (0.6046) loss 6.6353 (6.7384) grad_norm 2.7420 (2.7830) loss_scale 256.0000 (186.0060) mem 22339MB +[2024-07-29 05:44:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][340/625] eta 0:02:53 lr 0.000044 wd 0.0500 time 0.6019 (0.6076) data time 0.0007 (0.0025) model time 0.6011 (0.6055) loss 7.2523 (6.7377) grad_norm 2.2944 (2.7963) loss_scale 256.0000 (188.0587) mem 22339MB +[2024-07-29 05:44:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][350/625] eta 0:02:47 lr 0.000044 wd 0.0500 time 0.5858 (0.6081) data time 0.0008 (0.0024) model time 0.5849 (0.6061) loss 6.1198 (6.7281) grad_norm 2.3548 (2.7918) loss_scale 256.0000 (189.9943) mem 22339MB +[2024-07-29 05:44:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][360/625] eta 0:02:41 lr 0.000044 wd 0.0500 time 0.6011 (0.6079) data time 0.0009 (0.0024) model time 0.6002 (0.6059) loss 7.0614 (6.7246) grad_norm 3.7297 (2.8126) loss_scale 256.0000 (191.8227) mem 22339MB +[2024-07-29 05:44:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][370/625] eta 0:02:35 lr 0.000044 wd 0.0500 time 0.6026 (0.6082) data time 0.0010 (0.0023) model time 0.6015 (0.6062) loss 5.6585 (6.7252) grad_norm 1.5011 (2.8181) loss_scale 256.0000 (193.5526) mem 22339MB +[2024-07-29 05:44:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][380/625] eta 0:02:29 lr 0.000044 wd 0.0500 time 0.5871 (0.6082) data time 0.0008 (0.0023) model time 0.5864 (0.6063) loss 7.0481 (6.7334) grad_norm 2.9397 (2.8124) loss_scale 256.0000 (195.1916) mem 22339MB +[2024-07-29 05:44:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][390/625] eta 0:02:23 lr 0.000044 wd 0.0500 time 0.8018 (0.6098) data time 0.0010 (0.0023) model time 0.8008 (0.6082) loss 6.4740 (6.7296) grad_norm 2.0822 (2.8301) loss_scale 256.0000 (196.7468) mem 22339MB +[2024-07-29 05:44:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][400/625] eta 0:02:17 lr 0.000044 wd 0.0500 time 0.5977 (0.6098) data time 0.0008 (0.0022) model time 0.5969 (0.6082) loss 6.8611 (6.7312) grad_norm 1.9378 (2.8522) loss_scale 256.0000 (198.2244) mem 22339MB +[2024-07-29 05:44:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][410/625] eta 0:02:11 lr 0.000044 wd 0.0500 time 0.5981 (0.6099) data time 0.0010 (0.0022) model time 0.5970 (0.6083) loss 7.1052 (6.7418) grad_norm 3.2040 (2.8555) loss_scale 256.0000 (199.6302) mem 22339MB +[2024-07-29 05:45:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][420/625] eta 0:02:05 lr 0.000044 wd 0.0500 time 0.5872 (0.6099) data time 0.0010 (0.0022) model time 0.5862 (0.6083) loss 6.4167 (6.7383) grad_norm 1.9948 (2.8510) loss_scale 256.0000 (200.9691) mem 22339MB +[2024-07-29 05:45:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][430/625] eta 0:01:58 lr 0.000044 wd 0.0500 time 0.5844 (0.6098) data time 0.0011 (0.0022) model time 0.5833 (0.6083) loss 6.7204 (6.7435) grad_norm 1.8775 (2.8375) loss_scale 256.0000 (202.2459) mem 22339MB +[2024-07-29 05:45:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][440/625] eta 0:01:52 lr 0.000044 wd 0.0500 time 0.6031 (0.6097) data time 0.0011 (0.0021) model time 0.6021 (0.6081) loss 5.9723 (6.7425) grad_norm 2.0347 (2.8273) loss_scale 256.0000 (203.4649) mem 22339MB +[2024-07-29 05:45:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][450/625] eta 0:01:46 lr 0.000044 wd 0.0500 time 0.6030 (0.6097) data time 0.0008 (0.0021) model time 0.6022 (0.6082) loss 5.8894 (6.7394) grad_norm 2.3691 (2.8191) loss_scale 256.0000 (204.6297) mem 22339MB +[2024-07-29 05:45:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][460/625] eta 0:01:40 lr 0.000044 wd 0.0500 time 0.5960 (0.6097) data time 0.0009 (0.0021) model time 0.5951 (0.6081) loss 5.2439 (6.7340) grad_norm 2.5074 (2.8166) loss_scale 256.0000 (205.7440) mem 22339MB +[2024-07-29 05:45:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][470/625] eta 0:01:34 lr 0.000044 wd 0.0500 time 0.5963 (0.6099) data time 0.0009 (0.0021) model time 0.5954 (0.6083) loss 6.9487 (6.7307) grad_norm 3.6443 (2.8205) loss_scale 256.0000 (206.8110) mem 22339MB +[2024-07-29 05:45:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][480/625] eta 0:01:28 lr 0.000044 wd 0.0500 time 0.5988 (0.6102) data time 0.0008 (0.0021) model time 0.5980 (0.6087) loss 7.4481 (6.7344) grad_norm 2.5894 (2.8223) loss_scale 256.0000 (207.8337) mem 22339MB +[2024-07-29 05:45:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][490/625] eta 0:01:22 lr 0.000044 wd 0.0500 time 0.5968 (0.6100) data time 0.0008 (0.0020) model time 0.5959 (0.6085) loss 5.6710 (6.7287) grad_norm 2.9601 (2.8183) loss_scale 256.0000 (208.8147) mem 22339MB +[2024-07-29 05:45:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][500/625] eta 0:01:16 lr 0.000044 wd 0.0500 time 0.6042 (0.6103) data time 0.0008 (0.0020) model time 0.6034 (0.6089) loss 7.6784 (6.7332) grad_norm 1.7087 (2.8098) loss_scale 256.0000 (209.7565) mem 22339MB +[2024-07-29 05:45:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][510/625] eta 0:01:10 lr 0.000044 wd 0.0500 time 0.6000 (0.6106) data time 0.0009 (0.0020) model time 0.5991 (0.6092) loss 7.6594 (6.7356) grad_norm 2.4187 (2.8290) loss_scale 256.0000 (210.6614) mem 22339MB +[2024-07-29 05:46:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][520/625] eta 0:01:04 lr 0.000044 wd 0.0500 time 0.6040 (0.6106) data time 0.0010 (0.0020) model time 0.6030 (0.6092) loss 7.5677 (6.7310) grad_norm 1.9217 (2.8184) loss_scale 256.0000 (211.5317) mem 22339MB +[2024-07-29 05:46:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][530/625] eta 0:00:57 lr 0.000043 wd 0.0500 time 0.6032 (0.6105) data time 0.0010 (0.0020) model time 0.6022 (0.6091) loss 6.6175 (6.7320) grad_norm 1.9384 (2.8106) loss_scale 256.0000 (212.3691) mem 22339MB +[2024-07-29 05:46:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][540/625] eta 0:00:51 lr 0.000043 wd 0.0500 time 0.5971 (0.6106) data time 0.0008 (0.0019) model time 0.5963 (0.6092) loss 6.1126 (6.7284) grad_norm 1.8674 (2.8078) loss_scale 256.0000 (213.1756) mem 22339MB +[2024-07-29 05:46:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][550/625] eta 0:00:45 lr 0.000043 wd 0.0500 time 0.5840 (0.6107) data time 0.0011 (0.0019) model time 0.5829 (0.6094) loss 6.4078 (6.7274) grad_norm 2.2534 (2.8038) loss_scale 256.0000 (213.9528) mem 22339MB +[2024-07-29 05:46:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][560/625] eta 0:00:39 lr 0.000043 wd 0.0500 time 0.5974 (0.6106) data time 0.0011 (0.0019) model time 0.5964 (0.6092) loss 5.9894 (6.7273) grad_norm 2.1631 (2.7987) loss_scale 256.0000 (214.7023) mem 22339MB +[2024-07-29 05:46:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][570/625] eta 0:00:33 lr 0.000043 wd 0.0500 time 0.5948 (0.6106) data time 0.0008 (0.0019) model time 0.5940 (0.6092) loss 7.3777 (6.7218) grad_norm 5.1899 (2.7973) loss_scale 256.0000 (215.4256) mem 22339MB +[2024-07-29 05:46:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][580/625] eta 0:00:27 lr 0.000043 wd 0.0500 time 0.6000 (0.6104) data time 0.0008 (0.0019) model time 0.5992 (0.6090) loss 6.9993 (6.7228) grad_norm 5.2818 (2.8076) loss_scale 256.0000 (216.1239) mem 22339MB +[2024-07-29 05:46:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][590/625] eta 0:00:21 lr 0.000043 wd 0.0500 time 0.6003 (0.6103) data time 0.0011 (0.0019) model time 0.5992 (0.6089) loss 7.4554 (6.7297) grad_norm 2.6861 (2.8071) loss_scale 256.0000 (216.7986) mem 22339MB +[2024-07-29 05:46:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][600/625] eta 0:00:15 lr 0.000043 wd 0.0500 time 0.6062 (0.6102) data time 0.0009 (0.0019) model time 0.6054 (0.6088) loss 7.8574 (6.7311) grad_norm 2.8296 (2.8046) loss_scale 256.0000 (217.4509) mem 22339MB +[2024-07-29 05:46:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][610/625] eta 0:00:09 lr 0.000043 wd 0.0500 time 0.5935 (0.6109) data time 0.0008 (0.0019) model time 0.5928 (0.6095) loss 6.7745 (6.7361) grad_norm 2.8708 (2.8012) loss_scale 256.0000 (218.0818) mem 22339MB +[2024-07-29 05:47:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [270/300][620/625] eta 0:00:03 lr 0.000043 wd 0.0500 time 0.6022 (0.6108) data time 0.0008 (0.0018) model time 0.6015 (0.6094) loss 6.1317 (6.7321) grad_norm 3.4739 (2.7973) loss_scale 256.0000 (218.6924) mem 22339MB +[2024-07-29 05:47:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 270 training takes 0:06:21 +[2024-07-29 05:47:06 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 05:47:11 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 05:47:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.522 (0.522) Loss 0.4912 (0.4912) Acc@1 90.381 (90.381) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-29 05:47:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.162) Loss 0.7334 (0.5944) Acc@1 83.154 (88.117) Acc@5 97.266 (98.198) Mem 22339MB +[2024-07-29 05:47:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.145) Loss 0.8149 (0.6792) Acc@1 80.811 (85.579) Acc@5 96.387 (97.449) Mem 22339MB +[2024-07-29 05:47:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.257 Acc@5 97.439 +[2024-07-29 05:47:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 05:47:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.842 (0.842) Loss 0.4961 (0.4961) Acc@1 90.332 (90.332) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-29 05:47:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.127 (0.191) Loss 0.7349 (0.6005) Acc@1 83.105 (88.224) Acc@5 97.070 (98.176) Mem 22339MB +[2024-07-29 05:47:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.160) Loss 0.8213 (0.6847) Acc@1 81.055 (85.603) Acc@5 96.387 (97.414) Mem 22339MB +[2024-07-29 05:47:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.201 Acc@5 97.419 +[2024-07-29 05:47:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.2% +[2024-07-29 05:47:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.20% +[2024-07-29 05:47:18 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 05:47:20 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 05:47:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][0/625] eta 0:09:31 lr 0.000043 wd 0.0500 time 0.9141 (0.9141) data time 0.3791 (0.3791) model time 0.0000 (0.0000) loss 5.3717 (5.3717) grad_norm 8.4593 (8.4593) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:47:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][10/625] eta 0:06:27 lr 0.000043 wd 0.0500 time 0.6013 (0.6295) data time 0.0011 (0.0353) model time 0.0000 (0.0000) loss 7.0091 (6.8121) grad_norm 2.8787 (3.8430) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:47:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][20/625] eta 0:06:11 lr 0.000043 wd 0.0500 time 0.5949 (0.6144) data time 0.0008 (0.0190) model time 0.0000 (0.0000) loss 6.9191 (7.0184) grad_norm 3.1347 (3.3973) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:47:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][30/625] eta 0:06:03 lr 0.000043 wd 0.0500 time 0.6028 (0.6103) data time 0.0010 (0.0132) model time 0.0000 (0.0000) loss 6.9053 (7.0264) grad_norm 3.6797 (3.1647) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:47:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][40/625] eta 0:05:56 lr 0.000043 wd 0.0500 time 0.5997 (0.6087) data time 0.0008 (0.0102) model time 0.0000 (0.0000) loss 7.7880 (6.9789) grad_norm 2.7073 (3.0951) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:47:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][50/625] eta 0:05:49 lr 0.000043 wd 0.0500 time 0.6035 (0.6077) data time 0.0010 (0.0084) model time 0.0000 (0.0000) loss 6.9436 (6.9960) grad_norm 3.6930 (3.4502) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:47:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][60/625] eta 0:05:43 lr 0.000043 wd 0.0500 time 0.5985 (0.6071) data time 0.0008 (0.0072) model time 0.5977 (0.6032) loss 7.1892 (7.0156) grad_norm 1.7554 (3.2630) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:48:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][70/625] eta 0:05:36 lr 0.000043 wd 0.0500 time 0.5989 (0.6059) data time 0.0011 (0.0064) model time 0.5978 (0.6003) loss 6.1967 (6.9642) grad_norm 9.3866 (3.2868) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:48:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][80/625] eta 0:05:29 lr 0.000043 wd 0.0500 time 0.6013 (0.6055) data time 0.0010 (0.0057) model time 0.6003 (0.6006) loss 7.0027 (6.9695) grad_norm 2.4044 (3.2003) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:48:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][90/625] eta 0:05:23 lr 0.000043 wd 0.0500 time 0.5998 (0.6050) data time 0.0010 (0.0052) model time 0.5988 (0.6005) loss 6.3509 (6.9506) grad_norm 2.1339 (3.1222) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:48:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][100/625] eta 0:05:17 lr 0.000043 wd 0.0500 time 0.6021 (0.6046) data time 0.0008 (0.0048) model time 0.6014 (0.6004) loss 7.1213 (6.9150) grad_norm 2.3997 (3.3366) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:48:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][110/625] eta 0:05:11 lr 0.000043 wd 0.0500 time 0.6047 (0.6047) data time 0.0008 (0.0044) model time 0.6039 (0.6012) loss 7.3175 (6.8752) grad_norm 1.8675 (3.2395) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:48:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][120/625] eta 0:05:05 lr 0.000043 wd 0.0500 time 0.6014 (0.6047) data time 0.0011 (0.0042) model time 0.6002 (0.6014) loss 8.3340 (6.8718) grad_norm 3.2740 (3.5659) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:48:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][130/625] eta 0:04:59 lr 0.000043 wd 0.0500 time 0.6029 (0.6047) data time 0.0010 (0.0039) model time 0.6019 (0.6018) loss 6.3320 (6.8601) grad_norm 2.1703 (3.4668) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:48:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][140/625] eta 0:04:53 lr 0.000043 wd 0.0500 time 0.6018 (0.6047) data time 0.0010 (0.0037) model time 0.6008 (0.6019) loss 6.9882 (6.8584) grad_norm 2.6111 (3.4246) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:48:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][150/625] eta 0:04:47 lr 0.000043 wd 0.0500 time 0.6004 (0.6044) data time 0.0011 (0.0035) model time 0.5994 (0.6017) loss 7.1818 (6.8264) grad_norm 2.7429 (3.4156) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:48:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][160/625] eta 0:04:41 lr 0.000043 wd 0.0500 time 0.6005 (0.6043) data time 0.0011 (0.0034) model time 0.5994 (0.6017) loss 7.6367 (6.8492) grad_norm 2.9308 (3.4494) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:49:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][170/625] eta 0:04:34 lr 0.000043 wd 0.0500 time 0.6037 (0.6041) data time 0.0008 (0.0032) model time 0.6029 (0.6015) loss 6.1048 (6.8471) grad_norm 3.1653 (3.3922) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:49:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][180/625] eta 0:04:28 lr 0.000043 wd 0.0500 time 0.6038 (0.6041) data time 0.0010 (0.0031) model time 0.6029 (0.6016) loss 6.1358 (6.8311) grad_norm 2.3505 (3.3278) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:49:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][190/625] eta 0:04:22 lr 0.000043 wd 0.0500 time 0.6015 (0.6041) data time 0.0011 (0.0030) model time 0.6003 (0.6017) loss 5.9532 (6.8266) grad_norm 1.8378 (3.3808) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:49:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][200/625] eta 0:04:16 lr 0.000042 wd 0.0500 time 0.5981 (0.6043) data time 0.0010 (0.0029) model time 0.5971 (0.6021) loss 7.4482 (6.8247) grad_norm 1.8472 (3.3772) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:49:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][210/625] eta 0:04:12 lr 0.000042 wd 0.0500 time 0.6040 (0.6075) data time 0.0012 (0.0028) model time 0.6028 (0.6065) loss 5.8037 (6.8093) grad_norm 4.0003 (3.3426) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:49:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][220/625] eta 0:04:05 lr 0.000042 wd 0.0500 time 0.6026 (0.6073) data time 0.0011 (0.0027) model time 0.6015 (0.6062) loss 7.2488 (6.8109) grad_norm 2.1688 (3.3214) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:49:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][230/625] eta 0:03:59 lr 0.000042 wd 0.0500 time 0.6028 (0.6074) data time 0.0011 (0.0027) model time 0.6018 (0.6063) loss 8.0495 (6.8182) grad_norm 2.4288 (3.2910) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:49:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][240/625] eta 0:03:53 lr 0.000042 wd 0.0500 time 0.5978 (0.6071) data time 0.0008 (0.0026) model time 0.5970 (0.6060) loss 6.7111 (6.8114) grad_norm 2.6357 (3.3774) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:49:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][250/625] eta 0:03:47 lr 0.000042 wd 0.0500 time 0.6023 (0.6070) data time 0.0010 (0.0025) model time 0.6012 (0.6058) loss 7.5281 (6.8246) grad_norm 2.9904 (3.3632) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:49:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][260/625] eta 0:03:41 lr 0.000042 wd 0.0500 time 0.6035 (0.6069) data time 0.0010 (0.0025) model time 0.6025 (0.6056) loss 8.0309 (6.8291) grad_norm 2.2305 (3.3414) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:50:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][270/625] eta 0:03:35 lr 0.000042 wd 0.0500 time 0.6024 (0.6068) data time 0.0008 (0.0024) model time 0.6016 (0.6055) loss 6.0647 (6.8205) grad_norm 2.9047 (3.3282) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:50:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][280/625] eta 0:03:29 lr 0.000042 wd 0.0500 time 0.6003 (0.6067) data time 0.0009 (0.0024) model time 0.5994 (0.6054) loss 5.8780 (6.7983) grad_norm 1.9613 (3.2939) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:50:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][290/625] eta 0:03:23 lr 0.000042 wd 0.0500 time 0.6034 (0.6065) data time 0.0008 (0.0023) model time 0.6025 (0.6053) loss 6.8452 (6.7869) grad_norm 2.9201 (3.2671) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:50:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][300/625] eta 0:03:17 lr 0.000042 wd 0.0500 time 0.6014 (0.6064) data time 0.0009 (0.0023) model time 0.6005 (0.6051) loss 8.3601 (6.7766) grad_norm 3.4080 (3.2467) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:50:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][310/625] eta 0:03:10 lr 0.000042 wd 0.0500 time 0.6032 (0.6063) data time 0.0008 (0.0022) model time 0.6024 (0.6049) loss 7.0081 (6.7684) grad_norm 2.8007 (3.2403) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:50:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][320/625] eta 0:03:04 lr 0.000042 wd 0.0500 time 0.6045 (0.6062) data time 0.0008 (0.0022) model time 0.6037 (0.6049) loss 6.2635 (6.7659) grad_norm 3.2136 (3.2215) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:50:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][330/625] eta 0:02:58 lr 0.000042 wd 0.0500 time 0.6018 (0.6061) data time 0.0008 (0.0022) model time 0.6010 (0.6048) loss 7.1034 (6.7631) grad_norm 7.6571 (3.2184) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:50:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][340/625] eta 0:02:52 lr 0.000042 wd 0.0500 time 0.6064 (0.6061) data time 0.0010 (0.0021) model time 0.6053 (0.6048) loss 7.1168 (6.7503) grad_norm 2.8686 (3.1984) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:50:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][350/625] eta 0:02:46 lr 0.000042 wd 0.0500 time 0.5997 (0.6060) data time 0.0008 (0.0021) model time 0.5989 (0.6047) loss 7.7135 (6.7420) grad_norm 2.1984 (3.1889) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:50:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][360/625] eta 0:02:40 lr 0.000042 wd 0.0500 time 0.6025 (0.6059) data time 0.0010 (0.0021) model time 0.6015 (0.6046) loss 7.3089 (6.7480) grad_norm 1.9957 (3.1667) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:51:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][370/625] eta 0:02:34 lr 0.000042 wd 0.0500 time 0.5899 (0.6057) data time 0.0008 (0.0020) model time 0.5891 (0.6044) loss 6.6910 (6.7519) grad_norm 2.0690 (3.1429) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:51:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][380/625] eta 0:02:28 lr 0.000042 wd 0.0500 time 0.6014 (0.6056) data time 0.0011 (0.0020) model time 0.6002 (0.6042) loss 6.2693 (6.7453) grad_norm 2.2605 (3.1380) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:51:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][390/625] eta 0:02:22 lr 0.000042 wd 0.0500 time 0.6014 (0.6054) data time 0.0011 (0.0020) model time 0.6003 (0.6041) loss 7.8773 (6.7421) grad_norm 2.3545 (3.1251) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:51:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][400/625] eta 0:02:16 lr 0.000042 wd 0.0500 time 0.6017 (0.6055) data time 0.0010 (0.0020) model time 0.6008 (0.6041) loss 5.4453 (6.7415) grad_norm 2.2467 (3.1106) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:51:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][410/625] eta 0:02:10 lr 0.000042 wd 0.0500 time 0.6021 (0.6054) data time 0.0009 (0.0019) model time 0.6012 (0.6041) loss 6.5743 (6.7406) grad_norm 2.3339 (3.1131) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:51:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][420/625] eta 0:02:04 lr 0.000042 wd 0.0500 time 0.6011 (0.6057) data time 0.0013 (0.0019) model time 0.5998 (0.6044) loss 8.3464 (6.7474) grad_norm 2.6905 (3.1452) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:51:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][430/625] eta 0:01:58 lr 0.000042 wd 0.0500 time 0.5995 (0.6068) data time 0.0011 (0.0019) model time 0.5983 (0.6057) loss 8.1075 (6.7384) grad_norm 3.5197 (3.1404) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:51:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][440/625] eta 0:01:52 lr 0.000042 wd 0.0500 time 0.6019 (0.6067) data time 0.0008 (0.0019) model time 0.6011 (0.6055) loss 7.7486 (6.7466) grad_norm 1.9099 (3.1223) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:51:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][450/625] eta 0:01:46 lr 0.000042 wd 0.0500 time 0.5933 (0.6069) data time 0.0011 (0.0019) model time 0.5921 (0.6057) loss 7.1173 (6.7521) grad_norm 2.0070 (3.1102) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:52:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][460/625] eta 0:01:40 lr 0.000042 wd 0.0500 time 0.5973 (0.6067) data time 0.0009 (0.0018) model time 0.5964 (0.6055) loss 7.6365 (6.7509) grad_norm 2.2398 (3.1132) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:52:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][470/625] eta 0:01:34 lr 0.000042 wd 0.0500 time 0.6005 (0.6066) data time 0.0009 (0.0018) model time 0.5996 (0.6054) loss 5.9706 (6.7366) grad_norm 3.0032 (3.1002) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:52:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][480/625] eta 0:01:27 lr 0.000042 wd 0.0500 time 0.6023 (0.6065) data time 0.0008 (0.0018) model time 0.6015 (0.6054) loss 5.6352 (6.7338) grad_norm 3.1824 (3.0969) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:52:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][490/625] eta 0:01:21 lr 0.000042 wd 0.0500 time 0.6036 (0.6065) data time 0.0008 (0.0018) model time 0.6028 (0.6053) loss 6.5791 (6.7315) grad_norm 2.9721 (3.0850) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:52:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][500/625] eta 0:01:15 lr 0.000041 wd 0.0500 time 0.6023 (0.6064) data time 0.0010 (0.0018) model time 0.6013 (0.6052) loss 5.4700 (6.7209) grad_norm 2.4196 (3.0888) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:52:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][510/625] eta 0:01:09 lr 0.000041 wd 0.0500 time 0.6020 (0.6063) data time 0.0008 (0.0018) model time 0.6012 (0.6051) loss 7.0371 (6.7237) grad_norm 2.3244 (3.0800) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:52:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][520/625] eta 0:01:03 lr 0.000041 wd 0.0500 time 0.6013 (0.6062) data time 0.0008 (0.0017) model time 0.6005 (0.6050) loss 6.9756 (6.7240) grad_norm 2.1886 (3.0712) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:52:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][530/625] eta 0:00:57 lr 0.000041 wd 0.0500 time 0.5994 (0.6062) data time 0.0008 (0.0017) model time 0.5986 (0.6050) loss 5.6057 (6.7251) grad_norm 2.8071 (3.0658) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:52:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][540/625] eta 0:00:51 lr 0.000041 wd 0.0500 time 0.6047 (0.6061) data time 0.0011 (0.0017) model time 0.6037 (0.6049) loss 7.1637 (6.7270) grad_norm 2.3827 (3.0537) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:52:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][550/625] eta 0:00:45 lr 0.000041 wd 0.0500 time 0.6047 (0.6061) data time 0.0008 (0.0017) model time 0.6039 (0.6049) loss 6.6569 (6.7223) grad_norm 2.1397 (3.0368) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:53:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][560/625] eta 0:00:39 lr 0.000041 wd 0.0500 time 0.6037 (0.6061) data time 0.0010 (0.0017) model time 0.6027 (0.6049) loss 7.5295 (6.7212) grad_norm 2.6051 (3.0319) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:53:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][570/625] eta 0:00:33 lr 0.000041 wd 0.0500 time 0.6044 (0.6060) data time 0.0009 (0.0017) model time 0.6035 (0.6048) loss 6.1565 (6.7231) grad_norm 2.0386 (3.0242) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:53:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][580/625] eta 0:00:27 lr 0.000041 wd 0.0500 time 0.6005 (0.6060) data time 0.0010 (0.0017) model time 0.5995 (0.6048) loss 6.3424 (6.7252) grad_norm 2.2703 (3.0362) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:53:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][590/625] eta 0:00:21 lr 0.000041 wd 0.0500 time 0.5993 (0.6059) data time 0.0009 (0.0017) model time 0.5984 (0.6048) loss 5.6173 (6.7177) grad_norm 1.7870 (3.0314) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:53:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][600/625] eta 0:00:15 lr 0.000041 wd 0.0500 time 0.6011 (0.6059) data time 0.0011 (0.0017) model time 0.6001 (0.6047) loss 6.8661 (6.7164) grad_norm 2.3394 (3.0379) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:53:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][610/625] eta 0:00:09 lr 0.000041 wd 0.0500 time 0.6005 (0.6059) data time 0.0008 (0.0016) model time 0.5996 (0.6047) loss 5.7726 (6.7175) grad_norm 2.0897 (3.0304) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:53:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [271/300][620/625] eta 0:00:03 lr 0.000041 wd 0.0500 time 0.6056 (0.6058) data time 0.0006 (0.0016) model time 0.6050 (0.6047) loss 7.0562 (6.7209) grad_norm 2.0371 (3.0210) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:53:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 271 training takes 0:06:18 +[2024-07-29 05:53:39 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 05:53:41 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 05:53:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.544 (0.544) Loss 0.4958 (0.4958) Acc@1 90.332 (90.332) Acc@5 98.926 (98.926) Mem 22339MB +[2024-07-29 05:53:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.164) Loss 0.7354 (0.5956) Acc@1 83.057 (88.179) Acc@5 96.973 (98.113) Mem 22339MB +[2024-07-29 05:53:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.127 (0.146) Loss 0.8159 (0.6800) Acc@1 81.689 (85.631) Acc@5 96.338 (97.403) Mem 22339MB +[2024-07-29 05:53:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.277 Acc@5 97.385 +[2024-07-29 05:53:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 05:53:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 85.28% +[2024-07-29 05:53:44 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-29 05:53:47 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-29 05:53:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.517 (0.517) Loss 0.4961 (0.4961) Acc@1 90.332 (90.332) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-29 05:53:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.161) Loss 0.7344 (0.6001) Acc@1 83.057 (88.224) Acc@5 97.070 (98.176) Mem 22339MB +[2024-07-29 05:53:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.127 (0.145) Loss 0.8203 (0.6842) Acc@1 81.250 (85.621) Acc@5 96.436 (97.417) Mem 22339MB +[2024-07-29 05:53:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.221 Acc@5 97.419 +[2024-07-29 05:53:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.2% +[2024-07-29 05:53:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.22% +[2024-07-29 05:53:51 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 05:53:54 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 05:53:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][0/625] eta 0:10:07 lr 0.000041 wd 0.0500 time 0.9723 (0.9723) data time 0.4364 (0.4364) model time 0.0000 (0.0000) loss 5.9127 (5.9127) grad_norm 3.0316 (3.0316) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:54:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][10/625] eta 0:06:30 lr 0.000041 wd 0.0500 time 0.5939 (0.6356) data time 0.0010 (0.0407) model time 0.0000 (0.0000) loss 6.7235 (6.4922) grad_norm 4.1270 (2.6299) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:54:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][20/625] eta 0:06:27 lr 0.000041 wd 0.0500 time 0.7349 (0.6408) data time 0.0010 (0.0218) model time 0.0000 (0.0000) loss 7.5662 (6.7317) grad_norm 1.9919 (2.5653) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:54:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][30/625] eta 0:06:16 lr 0.000041 wd 0.0500 time 0.6023 (0.6334) data time 0.0010 (0.0151) model time 0.0000 (0.0000) loss 6.8660 (6.8133) grad_norm 1.7804 (2.5889) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 05:54:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 05:54:16 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 05:54:20 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 05:56:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 05:56:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 05:57:34 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 05:57:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 05:57:44 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 05:57:45 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 05:57:45 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 05:57:45 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 272) +[2024-07-29 05:57:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 05:58:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][40/625] eta 0:21:54 lr 0.000041 wd 0.0500 time 0.5428 (2.2474) data time 0.0008 (0.1330) model time 0.0000 (0.0000) loss 6.7276 (7.0142) grad_norm 4.1663 (3.3977) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 05:58:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][50/625] eta 0:12:21 lr 0.000041 wd 0.0500 time 0.5192 (1.2901) data time 0.0008 (0.0597) model time 0.0000 (0.0000) loss 7.2471 (6.8755) grad_norm 2.6826 (5.0362) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 05:58:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][60/625] eta 0:09:35 lr 0.000041 wd 0.0500 time 0.5208 (1.0179) data time 0.0016 (0.0388) model time 0.5193 (0.5268) loss 7.5584 (6.9577) grad_norm 2.6030 (4.2967) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 05:58:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][70/625] eta 0:08:16 lr 0.000041 wd 0.0500 time 0.5171 (0.8954) data time 0.0009 (0.0289) model time 0.5161 (0.5392) loss 6.8967 (6.8822) grad_norm 3.6474 (3.8204) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 05:58:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][80/625] eta 0:07:27 lr 0.000041 wd 0.0500 time 0.6246 (0.8218) data time 0.0009 (0.0231) model time 0.6236 (0.5396) loss 7.2550 (6.8421) grad_norm 3.1285 (3.9829) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 05:58:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][90/625] eta 0:06:56 lr 0.000041 wd 0.0500 time 0.5183 (0.7792) data time 0.0008 (0.0198) model time 0.5175 (0.5474) loss 5.8538 (6.8148) grad_norm 2.4500 (3.7769) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 05:58:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][100/625] eta 0:06:30 lr 0.000041 wd 0.0500 time 0.5194 (0.7446) data time 0.0007 (0.0171) model time 0.5187 (0.5466) loss 6.4501 (6.7541) grad_norm 2.6802 (3.6036) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 05:58:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][110/625] eta 0:06:09 lr 0.000041 wd 0.0500 time 0.5442 (0.7171) data time 0.0007 (0.0150) model time 0.5435 (0.5436) loss 6.6814 (6.7710) grad_norm 3.2530 (4.4014) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 05:58:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][120/625] eta 0:05:52 lr 0.000041 wd 0.0500 time 0.5272 (0.6977) data time 0.0009 (0.0134) model time 0.5262 (0.5438) loss 6.8857 (6.7483) grad_norm 63.9925 (4.8807) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 05:58:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][130/625] eta 0:05:37 lr 0.000041 wd 0.0500 time 0.5181 (0.6810) data time 0.0006 (0.0122) model time 0.5175 (0.5425) loss 7.0903 (6.7642) grad_norm 1.8325 (4.6204) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 05:59:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][140/625] eta 0:05:24 lr 0.000041 wd 0.0500 time 0.5471 (0.6681) data time 0.0007 (0.0113) model time 0.5464 (0.5421) loss 5.0699 (6.7727) grad_norm 2.0591 (4.4681) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 05:59:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][150/625] eta 0:05:12 lr 0.000041 wd 0.0500 time 0.5370 (0.6573) data time 0.0011 (0.0107) model time 0.5359 (0.5415) loss 6.1658 (6.7682) grad_norm 2.4064 (4.3291) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 05:59:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][160/625] eta 0:05:01 lr 0.000041 wd 0.0500 time 0.6168 (0.6487) data time 0.0007 (0.0100) model time 0.6161 (0.5419) loss 6.7810 (6.7584) grad_norm 2.9094 (4.2105) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 05:59:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][170/625] eta 0:04:51 lr 0.000041 wd 0.0500 time 0.5378 (0.6404) data time 0.0016 (0.0093) model time 0.5362 (0.5412) loss 6.5124 (6.7403) grad_norm 2.0091 (4.1107) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 05:59:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][180/625] eta 0:04:41 lr 0.000040 wd 0.0500 time 0.5174 (0.6330) data time 0.0007 (0.0088) model time 0.5167 (0.5403) loss 6.2552 (6.7211) grad_norm 2.1684 (3.9811) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 05:59:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][190/625] eta 0:04:32 lr 0.000040 wd 0.0500 time 0.5583 (0.6264) data time 0.0011 (0.0083) model time 0.5571 (0.5394) loss 5.6011 (6.6976) grad_norm 2.2790 (3.8773) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 05:59:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][200/625] eta 0:04:23 lr 0.000040 wd 0.0500 time 0.5232 (0.6202) data time 0.0010 (0.0079) model time 0.5222 (0.5382) loss 7.7125 (6.7199) grad_norm 2.1769 (3.7824) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 05:59:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][210/625] eta 0:04:15 lr 0.000040 wd 0.0500 time 0.5206 (0.6153) data time 0.0008 (0.0075) model time 0.5198 (0.5377) loss 5.7694 (6.7046) grad_norm 4.4574 (3.7304) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 05:59:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][220/625] eta 0:04:07 lr 0.000040 wd 0.0500 time 0.5457 (0.6112) data time 0.0007 (0.0072) model time 0.5450 (0.5377) loss 6.4691 (6.7128) grad_norm 7.1796 (3.6863) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 05:59:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][230/625] eta 0:04:00 lr 0.000040 wd 0.0500 time 0.6786 (0.6078) data time 0.0010 (0.0069) model time 0.6776 (0.5380) loss 5.8910 (6.6963) grad_norm 2.1878 (3.6348) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 05:59:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][240/625] eta 0:03:52 lr 0.000040 wd 0.0500 time 0.5229 (0.6039) data time 0.0009 (0.0066) model time 0.5220 (0.5373) loss 7.4334 (6.6851) grad_norm 3.0027 (3.5802) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 06:00:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][250/625] eta 0:03:45 lr 0.000040 wd 0.0500 time 0.5326 (0.6008) data time 0.0010 (0.0064) model time 0.5317 (0.5372) loss 5.4028 (6.6723) grad_norm 15.5258 (3.5944) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 06:00:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][260/625] eta 0:03:38 lr 0.000040 wd 0.0500 time 0.5530 (0.5977) data time 0.0009 (0.0061) model time 0.5521 (0.5369) loss 8.2534 (6.6843) grad_norm 2.1201 (3.5391) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 06:00:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][270/625] eta 0:03:31 lr 0.000040 wd 0.0500 time 0.5173 (0.5949) data time 0.0007 (0.0059) model time 0.5166 (0.5366) loss 6.4930 (6.6830) grad_norm 2.6588 (3.4882) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 06:00:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][280/625] eta 0:03:24 lr 0.000040 wd 0.0500 time 0.5281 (0.5926) data time 0.0009 (0.0057) model time 0.5272 (0.5365) loss 5.8771 (6.6744) grad_norm 1.8023 (3.6357) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 06:00:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][290/625] eta 0:03:17 lr 0.000040 wd 0.0500 time 0.5212 (0.5902) data time 0.0009 (0.0056) model time 0.5203 (0.5362) loss 7.0640 (6.6588) grad_norm 2.1519 (3.5963) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 06:00:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][300/625] eta 0:03:11 lr 0.000040 wd 0.0500 time 0.5496 (0.5880) data time 0.0008 (0.0054) model time 0.5488 (0.5360) loss 7.2981 (6.6524) grad_norm 1.7929 (3.5551) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 06:00:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][310/625] eta 0:03:05 lr 0.000040 wd 0.0500 time 0.5708 (0.5878) data time 0.0010 (0.0052) model time 0.5698 (0.5377) loss 5.3579 (6.6597) grad_norm 2.2955 (3.5195) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 06:00:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][320/625] eta 0:02:58 lr 0.000040 wd 0.0500 time 0.5286 (0.5861) data time 0.0006 (0.0051) model time 0.5279 (0.5377) loss 6.6102 (6.6725) grad_norm 2.1693 (3.5008) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 06:00:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][330/625] eta 0:02:52 lr 0.000040 wd 0.0500 time 0.5186 (0.5846) data time 0.0010 (0.0050) model time 0.5177 (0.5378) loss 5.8645 (6.6584) grad_norm 2.5794 (3.4783) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 06:00:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][340/625] eta 0:02:46 lr 0.000040 wd 0.0500 time 0.5266 (0.5830) data time 0.0007 (0.0049) model time 0.5259 (0.5376) loss 6.0275 (6.6618) grad_norm 2.7045 (3.5179) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 06:00:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][350/625] eta 0:02:39 lr 0.000040 wd 0.0500 time 0.5277 (0.5813) data time 0.0010 (0.0047) model time 0.5267 (0.5373) loss 6.5408 (6.6788) grad_norm 2.7317 (3.4887) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 06:01:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][360/625] eta 0:02:33 lr 0.000040 wd 0.0500 time 0.5669 (0.5798) data time 0.0011 (0.0046) model time 0.5658 (0.5371) loss 6.6504 (6.6903) grad_norm 2.4709 (3.4572) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 06:01:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][370/625] eta 0:02:27 lr 0.000040 wd 0.0500 time 0.5183 (0.5780) data time 0.0012 (0.0045) model time 0.5172 (0.5365) loss 8.2191 (6.6822) grad_norm 2.9761 (3.4433) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 06:01:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 06:01:07 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 06:01:12 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 06:05:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 06:05:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 06:05:54 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 06:06:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 06:06:03 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 06:06:04 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 06:06:04 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 06:06:04 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 272) +[2024-07-29 06:06:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 06:06:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][380/625] eta 0:07:38 lr 0.000040 wd 0.0500 time 0.5735 (1.8695) data time 0.0007 (0.1335) model time 0.5728 (1.7360) loss 6.1486 (7.1319) grad_norm 2.8369 (4.3233) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:06:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][390/625] eta 0:04:30 lr 0.000040 wd 0.0500 time 0.5740 (1.1497) data time 0.0006 (0.0598) model time 0.5734 (1.0899) loss 7.9591 (7.0635) grad_norm 3.0063 (3.4269) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:06:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][400/625] eta 0:03:32 lr 0.000040 wd 0.0500 time 0.5694 (0.9433) data time 0.0009 (0.0388) model time 0.5685 (0.9045) loss 7.9448 (7.0582) grad_norm 2.7713 (3.1144) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:06:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][410/625] eta 0:03:01 lr 0.000040 wd 0.0500 time 0.5755 (0.8456) data time 0.0008 (0.0288) model time 0.5747 (0.8168) loss 7.0912 (6.9960) grad_norm 2.9617 (2.9179) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:06:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][420/625] eta 0:02:41 lr 0.000040 wd 0.0500 time 0.5719 (0.7889) data time 0.0007 (0.0230) model time 0.5711 (0.7660) loss 6.4658 (6.9358) grad_norm 2.5578 (2.8185) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:06:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][430/625] eta 0:02:27 lr 0.000040 wd 0.0500 time 0.5735 (0.7572) data time 0.0007 (0.0191) model time 0.5729 (0.7381) loss 6.5326 (6.8736) grad_norm 2.5778 (2.8987) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:06:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][440/625] eta 0:02:15 lr 0.000040 wd 0.0500 time 0.5775 (0.7305) data time 0.0006 (0.0164) model time 0.5768 (0.7140) loss 5.2149 (6.8230) grad_norm 2.6354 (2.8272) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:07:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][450/625] eta 0:02:04 lr 0.000040 wd 0.0500 time 0.5784 (0.7108) data time 0.0006 (0.0144) model time 0.5777 (0.6963) loss 5.6999 (6.8037) grad_norm 3.2460 (2.7882) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:07:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][460/625] eta 0:01:54 lr 0.000040 wd 0.0500 time 0.5783 (0.6955) data time 0.0008 (0.0129) model time 0.5775 (0.6826) loss 7.7986 (6.8185) grad_norm 2.0372 (2.7505) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:07:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][470/625] eta 0:01:45 lr 0.000040 wd 0.0500 time 0.5759 (0.6833) data time 0.0007 (0.0117) model time 0.5752 (0.6717) loss 7.0245 (6.8334) grad_norm 1.6766 (2.7240) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:07:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][480/625] eta 0:01:37 lr 0.000040 wd 0.0500 time 0.5768 (0.6733) data time 0.0006 (0.0107) model time 0.5762 (0.6626) loss 5.8871 (6.8384) grad_norm 2.8132 (2.7002) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:07:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][490/625] eta 0:01:29 lr 0.000039 wd 0.0500 time 0.5772 (0.6651) data time 0.0008 (0.0098) model time 0.5764 (0.6552) loss 6.8558 (6.8337) grad_norm 2.4150 (2.6880) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:07:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][500/625] eta 0:01:22 lr 0.000039 wd 0.0500 time 0.5768 (0.6581) data time 0.0006 (0.0091) model time 0.5762 (0.6489) loss 7.1063 (6.8060) grad_norm 1.8999 (2.6750) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:07:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][510/625] eta 0:01:15 lr 0.000039 wd 0.0500 time 0.5810 (0.6523) data time 0.0008 (0.0085) model time 0.5802 (0.6438) loss 6.9248 (6.8003) grad_norm 2.2778 (2.7102) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:07:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][520/625] eta 0:01:07 lr 0.000039 wd 0.0500 time 0.5754 (0.6473) data time 0.0006 (0.0080) model time 0.5748 (0.6393) loss 5.7200 (6.7855) grad_norm 2.5577 (2.6970) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:07:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][530/625] eta 0:01:01 lr 0.000039 wd 0.0500 time 0.5786 (0.6430) data time 0.0006 (0.0075) model time 0.5780 (0.6354) loss 6.2694 (6.7768) grad_norm 1.9882 (2.6951) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:07:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][540/625] eta 0:00:54 lr 0.000039 wd 0.0500 time 0.5775 (0.6391) data time 0.0008 (0.0071) model time 0.5767 (0.6320) loss 7.5065 (6.8090) grad_norm 3.5568 (2.6997) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:08:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][550/625] eta 0:00:47 lr 0.000039 wd 0.0500 time 0.5756 (0.6356) data time 0.0006 (0.0068) model time 0.5749 (0.6288) loss 6.2483 (6.7936) grad_norm 2.1077 (2.7115) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:08:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][560/625] eta 0:00:41 lr 0.000039 wd 0.0500 time 0.5747 (0.6324) data time 0.0006 (0.0065) model time 0.5740 (0.6260) loss 7.2248 (6.7989) grad_norm 3.9943 (2.7094) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:08:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][570/625] eta 0:00:34 lr 0.000039 wd 0.0500 time 0.5760 (0.6295) data time 0.0008 (0.0062) model time 0.5752 (0.6233) loss 5.7722 (6.7773) grad_norm 2.8781 (2.6872) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:08:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][580/625] eta 0:00:28 lr 0.000039 wd 0.0500 time 0.5754 (0.6270) data time 0.0008 (0.0059) model time 0.5746 (0.6211) loss 7.0998 (6.7540) grad_norm 3.1413 (2.6768) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:08:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][590/625] eta 0:00:21 lr 0.000039 wd 0.0500 time 0.5803 (0.6248) data time 0.0008 (0.0057) model time 0.5795 (0.6191) loss 6.2106 (6.7422) grad_norm 2.8860 (2.6831) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:08:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][600/625] eta 0:00:15 lr 0.000039 wd 0.0500 time 0.5792 (0.6228) data time 0.0009 (0.0055) model time 0.5783 (0.6173) loss 7.5512 (6.7474) grad_norm 2.0431 (2.6784) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:08:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][610/625] eta 0:00:09 lr 0.000039 wd 0.0500 time 0.5795 (0.6210) data time 0.0004 (0.0053) model time 0.5791 (0.6157) loss 6.6904 (6.7482) grad_norm 2.2464 (2.6639) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:08:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [272/300][620/625] eta 0:00:03 lr 0.000039 wd 0.0500 time 0.5748 (0.6191) data time 0.0006 (0.0051) model time 0.5742 (0.6140) loss 6.3025 (6.7508) grad_norm 2.3033 (2.7240) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:08:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 272 training takes 0:02:35 +[2024-07-29 06:08:44 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 06:08:49 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 06:08:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.460 (0.460) Loss 0.4910 (0.4910) Acc@1 90.381 (90.381) Acc@5 99.023 (99.023) Mem 22341MB +[2024-07-29 06:08:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.155) Loss 0.7373 (0.5930) Acc@1 83.057 (88.219) Acc@5 97.119 (98.153) Mem 22341MB +[2024-07-29 06:08:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.124 (0.141) Loss 0.8169 (0.6786) Acc@1 81.055 (85.642) Acc@5 96.338 (97.419) Mem 22341MB +[2024-07-29 06:08:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.271 Acc@5 97.397 +[2024-07-29 06:08:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 06:08:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.845 (0.845) Loss 0.4956 (0.4956) Acc@1 90.283 (90.283) Acc@5 99.023 (99.023) Mem 22341MB +[2024-07-29 06:08:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.190) Loss 0.7344 (0.5996) Acc@1 83.154 (88.246) Acc@5 97.070 (98.176) Mem 22341MB +[2024-07-29 06:08:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.124 (0.159) Loss 0.8193 (0.6835) Acc@1 81.299 (85.635) Acc@5 96.387 (97.407) Mem 22341MB +[2024-07-29 06:08:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.235 Acc@5 97.413 +[2024-07-29 06:08:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.2% +[2024-07-29 06:08:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.23% +[2024-07-29 06:08:59 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 06:09:02 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 06:12:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 06:12:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 06:12:39 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 06:12:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth +[2024-07-29 06:12:50 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth.................... +[2024-07-29 06:12:50 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 06:12:50 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 06:12:51 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth' (epoch 272) +[2024-07-29 06:12:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 06:13:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][0/625] eta 1:44:34 lr 0.000039 wd 0.0500 time 10.0386 (10.0386) data time 0.8237 (0.8237) model time 0.0000 (0.0000) loss 7.0995 (7.0995) grad_norm 2.6989 (2.6989) loss_scale 256.0000 (256.0000) mem 26016MB +[2024-07-29 06:13:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][10/625] eta 0:15:49 lr 0.000039 wd 0.0500 time 0.5676 (1.5445) data time 0.0009 (0.0757) model time 0.0000 (0.0000) loss 6.3465 (6.9902) grad_norm 2.1813 (2.6495) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:13:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][20/625] eta 0:10:54 lr 0.000039 wd 0.0500 time 0.5651 (1.0825) data time 0.0009 (0.0401) model time 0.0000 (0.0000) loss 6.5719 (6.9614) grad_norm 2.1066 (3.2807) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:13:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][30/625] eta 0:09:05 lr 0.000039 wd 0.0500 time 0.5691 (0.9173) data time 0.0007 (0.0276) model time 0.0000 (0.0000) loss 6.1106 (6.9793) grad_norm 2.3503 (2.9223) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:13:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][40/625] eta 0:08:08 lr 0.000039 wd 0.0500 time 0.5709 (0.8359) data time 0.0008 (0.0211) model time 0.0000 (0.0000) loss 7.0287 (6.8831) grad_norm 2.5458 (2.7857) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:13:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][50/625] eta 0:07:32 lr 0.000039 wd 0.0500 time 0.7612 (0.7875) data time 0.0007 (0.0171) model time 0.0000 (0.0000) loss 7.2832 (6.8624) grad_norm 2.6715 (2.7963) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:13:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][60/625] eta 0:07:06 lr 0.000039 wd 0.0500 time 0.5727 (0.7550) data time 0.0009 (0.0145) model time 0.5718 (0.5880) loss 6.5402 (6.8037) grad_norm 2.2117 (2.8019) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:13:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][70/625] eta 0:06:44 lr 0.000039 wd 0.0500 time 0.5714 (0.7296) data time 0.0009 (0.0126) model time 0.5705 (0.5809) loss 6.6752 (6.7840) grad_norm 2.5135 (2.7644) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:13:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][80/625] eta 0:06:27 lr 0.000039 wd 0.0500 time 0.5728 (0.7104) data time 0.0009 (0.0111) model time 0.5719 (0.5784) loss 6.1412 (6.7610) grad_norm 3.5140 (2.7500) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:13:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][90/625] eta 0:06:12 lr 0.000039 wd 0.0500 time 0.5721 (0.6956) data time 0.0007 (0.0100) model time 0.5715 (0.5774) loss 7.9198 (6.7877) grad_norm 2.4169 (2.7002) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:14:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][100/625] eta 0:05:59 lr 0.000039 wd 0.0500 time 0.5699 (0.6850) data time 0.0008 (0.0091) model time 0.5691 (0.5795) loss 7.5561 (6.8140) grad_norm 4.7614 (2.6912) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:14:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][110/625] eta 0:05:47 lr 0.000039 wd 0.0500 time 0.5874 (0.6752) data time 0.0008 (0.0084) model time 0.5866 (0.5787) loss 6.8274 (6.8281) grad_norm 3.0851 (2.7072) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:14:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][120/625] eta 0:05:36 lr 0.000039 wd 0.0500 time 0.5710 (0.6672) data time 0.0006 (0.0078) model time 0.5703 (0.5785) loss 5.7044 (6.8279) grad_norm 2.5675 (2.6804) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:14:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][130/625] eta 0:05:27 lr 0.000039 wd 0.0500 time 0.5716 (0.6612) data time 0.0011 (0.0073) model time 0.5705 (0.5797) loss 7.3106 (6.8063) grad_norm 3.1525 (2.7502) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:14:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][140/625] eta 0:05:17 lr 0.000039 wd 0.0500 time 0.5763 (0.6554) data time 0.0008 (0.0068) model time 0.5756 (0.5795) loss 6.6566 (6.7807) grad_norm 3.0797 (2.8314) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:14:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][150/625] eta 0:05:09 lr 0.000039 wd 0.0500 time 0.5759 (0.6507) data time 0.0008 (0.0064) model time 0.5751 (0.5799) loss 6.2840 (6.7681) grad_norm 3.3603 (2.8465) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:14:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][160/625] eta 0:05:00 lr 0.000039 wd 0.0500 time 0.5799 (0.6464) data time 0.0009 (0.0061) model time 0.5790 (0.5800) loss 6.6084 (6.7637) grad_norm 2.2574 (2.8260) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:14:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][170/625] eta 0:04:52 lr 0.000039 wd 0.0500 time 0.5705 (0.6427) data time 0.0010 (0.0059) model time 0.5695 (0.5800) loss 6.3621 (6.7611) grad_norm 1.6370 (2.8174) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:14:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][180/625] eta 0:04:44 lr 0.000038 wd 0.0500 time 0.5702 (0.6390) data time 0.0009 (0.0056) model time 0.5693 (0.5796) loss 6.9529 (6.7457) grad_norm 6.8585 (2.8481) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:14:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][190/625] eta 0:04:36 lr 0.000038 wd 0.0500 time 0.5703 (0.6355) data time 0.0008 (0.0054) model time 0.5695 (0.5790) loss 5.5062 (6.7304) grad_norm 1.9527 (2.8339) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:15:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][200/625] eta 0:04:28 lr 0.000038 wd 0.0500 time 0.5713 (0.6324) data time 0.0009 (0.0052) model time 0.5704 (0.5786) loss 7.6499 (6.7201) grad_norm 4.2753 (2.8314) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:15:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][210/625] eta 0:04:21 lr 0.000038 wd 0.0500 time 0.5683 (0.6296) data time 0.0010 (0.0050) model time 0.5674 (0.5782) loss 8.0509 (6.7171) grad_norm 2.1595 (2.8259) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:15:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][220/625] eta 0:04:13 lr 0.000038 wd 0.0500 time 0.5684 (0.6270) data time 0.0008 (0.0048) model time 0.5676 (0.5778) loss 6.7792 (6.7164) grad_norm 3.5379 (2.8235) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:15:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 06:15:15 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 06:15:17 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 06:17:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 06:17:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 06:19:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 06:19:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 06:19:50 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 06:23:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 06:23:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 06:23:29 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 06:23:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 06:23:42 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 06:23:42 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 06:23:42 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 06:23:42 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 273) +[2024-07-29 06:23:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 06:24:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][230/625] eta 0:14:27 lr 0.000038 wd 0.0500 time 0.5809 (2.1952) data time 0.0013 (0.1150) model time 0.5796 (2.0802) loss 6.8347 (7.0341) grad_norm 22.2546 (5.6844) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:24:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][240/625] eta 0:08:00 lr 0.000038 wd 0.0500 time 0.5809 (1.2472) data time 0.0011 (0.0480) model time 0.5799 (1.1992) loss 6.4572 (6.9156) grad_norm 2.1518 (3.9977) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:24:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][250/625] eta 0:06:15 lr 0.000038 wd 0.0500 time 0.5807 (1.0021) data time 0.0008 (0.0306) model time 0.5799 (0.9714) loss 7.6515 (7.0063) grad_norm 1.8723 (3.5239) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:24:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][260/625] eta 0:05:24 lr 0.000038 wd 0.0500 time 0.5840 (0.8890) data time 0.0011 (0.0227) model time 0.5829 (0.8663) loss 6.5128 (6.9667) grad_norm 2.3454 (3.1742) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:24:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][270/625] eta 0:04:52 lr 0.000038 wd 0.0500 time 0.5840 (0.8242) data time 0.0009 (0.0181) model time 0.5831 (0.8061) loss 6.6797 (6.8949) grad_norm 2.0743 (2.9858) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:24:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][280/625] eta 0:04:32 lr 0.000038 wd 0.0500 time 0.5836 (0.7899) data time 0.0011 (0.0151) model time 0.5825 (0.7748) loss 7.8120 (6.8555) grad_norm 2.1320 (2.8257) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:24:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][290/625] eta 0:04:14 lr 0.000038 wd 0.0500 time 0.5902 (0.7600) data time 0.0011 (0.0130) model time 0.5891 (0.7470) loss 7.2047 (6.8086) grad_norm 2.5964 (2.8030) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:24:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][300/625] eta 0:04:00 lr 0.000038 wd 0.0500 time 0.5933 (0.7386) data time 0.0010 (0.0115) model time 0.5923 (0.7271) loss 7.3349 (6.7690) grad_norm 2.1864 (2.7542) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:24:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][310/625] eta 0:03:47 lr 0.000038 wd 0.0500 time 0.5936 (0.7218) data time 0.0011 (0.0103) model time 0.5925 (0.7116) loss 6.2419 (6.7378) grad_norm 2.6936 (2.7075) loss_scale 512.0000 (270.7126) mem 22341MB +[2024-07-29 06:24:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][320/625] eta 0:03:36 lr 0.000038 wd 0.0500 time 0.5865 (0.7084) data time 0.0011 (0.0093) model time 0.5854 (0.6990) loss 7.7833 (6.7467) grad_norm 4.0787 (2.7036) loss_scale 512.0000 (295.5876) mem 22341MB +[2024-07-29 06:25:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][330/625] eta 0:03:25 lr 0.000038 wd 0.0500 time 0.5859 (0.6971) data time 0.0008 (0.0086) model time 0.5851 (0.6885) loss 6.4651 (6.7702) grad_norm 1.8559 (2.7107) loss_scale 512.0000 (315.8131) mem 22341MB +[2024-07-29 06:25:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][340/625] eta 0:03:16 lr 0.000038 wd 0.0500 time 0.5880 (0.6877) data time 0.0010 (0.0079) model time 0.5869 (0.6798) loss 7.1759 (6.7604) grad_norm 2.0073 (2.6753) loss_scale 512.0000 (332.5812) mem 22341MB +[2024-07-29 06:25:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][350/625] eta 0:03:06 lr 0.000038 wd 0.0500 time 0.5852 (0.6799) data time 0.0010 (0.0074) model time 0.5842 (0.6725) loss 7.1243 (6.7429) grad_norm 2.5418 (2.6559) loss_scale 512.0000 (346.7087) mem 22341MB +[2024-07-29 06:25:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][360/625] eta 0:02:58 lr 0.000038 wd 0.0500 time 0.5862 (0.6734) data time 0.0009 (0.0069) model time 0.5852 (0.6665) loss 6.7034 (6.7479) grad_norm 2.4847 (2.7502) loss_scale 512.0000 (358.7737) mem 22341MB +[2024-07-29 06:25:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][370/625] eta 0:02:50 lr 0.000038 wd 0.0500 time 0.5918 (0.6681) data time 0.0010 (0.0065) model time 0.5907 (0.6616) loss 6.8609 (6.7334) grad_norm 2.9694 (2.8219) loss_scale 512.0000 (369.1973) mem 22341MB +[2024-07-29 06:25:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][380/625] eta 0:02:42 lr 0.000038 wd 0.0500 time 0.5914 (0.6634) data time 0.0010 (0.0062) model time 0.5904 (0.6573) loss 5.8876 (6.7400) grad_norm 2.1728 (2.8386) loss_scale 512.0000 (378.2930) mem 22341MB +[2024-07-29 06:25:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][390/625] eta 0:02:34 lr 0.000038 wd 0.0500 time 0.5954 (0.6594) data time 0.0010 (0.0059) model time 0.5944 (0.6536) loss 6.2192 (6.7447) grad_norm 2.8757 (2.8302) loss_scale 512.0000 (386.2994) mem 22341MB +[2024-07-29 06:25:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][400/625] eta 0:02:27 lr 0.000038 wd 0.0500 time 0.5884 (0.6554) data time 0.0010 (0.0056) model time 0.5874 (0.6498) loss 6.2229 (6.7376) grad_norm 2.6861 (2.8136) loss_scale 512.0000 (393.4011) mem 22341MB +[2024-07-29 06:25:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][410/625] eta 0:02:20 lr 0.000038 wd 0.0500 time 0.5889 (0.6520) data time 0.0008 (0.0054) model time 0.5881 (0.6466) loss 6.1007 (6.7341) grad_norm 2.9382 (2.7952) loss_scale 512.0000 (399.7433) mem 22341MB +[2024-07-29 06:25:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][420/625] eta 0:02:13 lr 0.000038 wd 0.0500 time 0.5889 (0.6489) data time 0.0009 (0.0051) model time 0.5880 (0.6438) loss 7.3923 (6.7238) grad_norm 2.1380 (2.7885) loss_scale 512.0000 (405.4416) mem 22341MB +[2024-07-29 06:26:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][430/625] eta 0:02:06 lr 0.000038 wd 0.0500 time 0.5951 (0.6463) data time 0.0008 (0.0049) model time 0.5943 (0.6413) loss 6.3937 (6.7004) grad_norm 4.3177 (2.7822) loss_scale 512.0000 (410.5894) mem 22341MB +[2024-07-29 06:26:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][440/625] eta 0:01:59 lr 0.000038 wd 0.0500 time 0.5954 (0.6439) data time 0.0008 (0.0048) model time 0.5946 (0.6392) loss 7.0290 (6.6916) grad_norm 1.5762 (2.7591) loss_scale 512.0000 (415.2627) mem 22341MB +[2024-07-29 06:26:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][450/625] eta 0:01:52 lr 0.000038 wd 0.0500 time 0.5924 (0.6418) data time 0.0011 (0.0046) model time 0.5913 (0.6372) loss 6.6221 (6.7058) grad_norm 3.0889 (2.7573) loss_scale 512.0000 (419.5242) mem 22341MB +[2024-07-29 06:26:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][460/625] eta 0:01:45 lr 0.000038 wd 0.0500 time 0.5977 (0.6399) data time 0.0008 (0.0045) model time 0.5969 (0.6355) loss 6.4773 (6.7077) grad_norm 2.4940 (2.7458) loss_scale 512.0000 (423.4262) mem 22341MB +[2024-07-29 06:26:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][470/625] eta 0:01:38 lr 0.000038 wd 0.0500 time 0.5865 (0.6382) data time 0.0010 (0.0043) model time 0.5854 (0.6339) loss 6.9648 (6.6982) grad_norm 8.5308 (2.7933) loss_scale 512.0000 (427.0121) mem 22341MB +[2024-07-29 06:26:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][480/625] eta 0:01:32 lr 0.000038 wd 0.0500 time 0.5861 (0.6364) data time 0.0008 (0.0042) model time 0.5853 (0.6322) loss 5.8489 (6.6820) grad_norm 1.9864 (2.7794) loss_scale 512.0000 (430.3191) mem 22341MB +[2024-07-29 06:26:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][490/625] eta 0:01:25 lr 0.000038 wd 0.0500 time 0.5882 (0.6346) data time 0.0008 (0.0041) model time 0.5874 (0.6306) loss 5.8952 (6.6759) grad_norm 4.9054 (2.7920) loss_scale 512.0000 (433.3783) mem 22341MB +[2024-07-29 06:26:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][500/625] eta 0:01:19 lr 0.000037 wd 0.0500 time 0.5854 (0.6341) data time 0.0010 (0.0040) model time 0.5844 (0.6302) loss 7.1664 (6.6824) grad_norm 3.0922 (2.8039) loss_scale 512.0000 (436.2166) mem 22341MB +[2024-07-29 06:26:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][510/625] eta 0:01:12 lr 0.000037 wd 0.0500 time 0.5912 (0.6329) data time 0.0008 (0.0039) model time 0.5904 (0.6290) loss 7.0852 (6.6779) grad_norm 2.2027 (2.8275) loss_scale 512.0000 (438.8571) mem 22341MB +[2024-07-29 06:26:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][520/625] eta 0:01:06 lr 0.000037 wd 0.0500 time 0.5947 (0.6317) data time 0.0010 (0.0038) model time 0.5936 (0.6279) loss 5.0670 (6.6698) grad_norm 2.3648 (2.8288) loss_scale 512.0000 (441.3199) mem 22341MB +[2024-07-29 06:27:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][530/625] eta 0:00:59 lr 0.000037 wd 0.0500 time 0.5892 (0.6305) data time 0.0008 (0.0037) model time 0.5884 (0.6268) loss 7.6537 (6.6691) grad_norm 2.3434 (2.8188) loss_scale 512.0000 (443.6221) mem 22341MB +[2024-07-29 06:27:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][540/625] eta 0:00:53 lr 0.000037 wd 0.0500 time 0.5837 (0.6292) data time 0.0010 (0.0036) model time 0.5827 (0.6256) loss 7.8661 (6.6822) grad_norm 3.1967 (2.8037) loss_scale 512.0000 (445.7792) mem 22341MB +[2024-07-29 06:27:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][550/625] eta 0:00:47 lr 0.000037 wd 0.0500 time 0.5911 (0.6281) data time 0.0008 (0.0035) model time 0.5902 (0.6246) loss 5.3212 (6.6889) grad_norm 2.9546 (2.7999) loss_scale 512.0000 (447.8043) mem 22341MB +[2024-07-29 06:27:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][560/625] eta 0:00:40 lr 0.000037 wd 0.0500 time 0.5904 (0.6271) data time 0.0011 (0.0035) model time 0.5893 (0.6236) loss 5.6106 (6.6902) grad_norm 15.7157 (2.8345) loss_scale 512.0000 (449.7092) mem 22341MB +[2024-07-29 06:27:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][570/625] eta 0:00:34 lr 0.000037 wd 0.0500 time 0.5888 (0.6261) data time 0.0008 (0.0034) model time 0.5880 (0.6227) loss 6.6602 (6.7050) grad_norm 3.0923 (2.8247) loss_scale 512.0000 (451.5043) mem 22341MB +[2024-07-29 06:27:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][580/625] eta 0:00:28 lr 0.000037 wd 0.0500 time 0.5920 (0.6253) data time 0.0011 (0.0034) model time 0.5909 (0.6220) loss 5.6620 (6.7001) grad_norm 2.0049 (2.8499) loss_scale 512.0000 (453.1989) mem 22341MB +[2024-07-29 06:27:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][590/625] eta 0:00:21 lr 0.000037 wd 0.0500 time 0.5982 (0.6247) data time 0.0008 (0.0033) model time 0.5974 (0.6214) loss 6.2602 (6.6947) grad_norm 6.7611 (2.8541) loss_scale 512.0000 (454.8011) mem 22341MB +[2024-07-29 06:27:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][600/625] eta 0:00:15 lr 0.000037 wd 0.0500 time 0.5898 (0.6242) data time 0.0010 (0.0033) model time 0.5888 (0.6209) loss 6.4794 (6.6934) grad_norm 2.3187 (2.8530) loss_scale 512.0000 (456.3183) mem 22341MB +[2024-07-29 06:27:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][610/625] eta 0:00:09 lr 0.000037 wd 0.0500 time 0.5919 (0.6238) data time 0.0005 (0.0033) model time 0.5914 (0.6206) loss 6.8448 (6.6858) grad_norm 2.2091 (2.8451) loss_scale 512.0000 (457.7571) mem 22341MB +[2024-07-29 06:27:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [273/300][620/625] eta 0:00:03 lr 0.000037 wd 0.0500 time 0.5863 (0.6230) data time 0.0005 (0.0032) model time 0.5858 (0.6198) loss 6.1060 (6.6887) grad_norm 2.8724 (2.8379) loss_scale 512.0000 (459.1234) mem 22341MB +[2024-07-29 06:27:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 273 training takes 0:04:09 +[2024-07-29 06:27:57 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 06:28:04 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 06:28:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.523 (0.523) Loss 0.4958 (0.4958) Acc@1 90.186 (90.186) Acc@5 99.023 (99.023) Mem 22341MB +[2024-07-29 06:28:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.162) Loss 0.7373 (0.5953) Acc@1 82.910 (88.144) Acc@5 97.168 (98.145) Mem 22341MB +[2024-07-29 06:28:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.145) Loss 0.8149 (0.6795) Acc@1 81.445 (85.600) Acc@5 96.338 (97.410) Mem 22341MB +[2024-07-29 06:28:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.251 Acc@5 97.393 +[2024-07-29 06:28:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 06:28:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.828 (0.828) Loss 0.4956 (0.4956) Acc@1 90.332 (90.332) Acc@5 99.023 (99.023) Mem 22341MB +[2024-07-29 06:28:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.189) Loss 0.7344 (0.5992) Acc@1 83.301 (88.250) Acc@5 97.119 (98.171) Mem 22341MB +[2024-07-29 06:28:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.159) Loss 0.8188 (0.6832) Acc@1 81.348 (85.631) Acc@5 96.387 (97.400) Mem 22341MB +[2024-07-29 06:28:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.231 Acc@5 97.405 +[2024-07-29 06:28:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.2% +[2024-07-29 06:28:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.23% +[2024-07-29 06:28:13 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 06:28:19 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 06:28:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][0/625] eta 0:13:42 lr 0.000037 wd 0.0500 time 1.3163 (1.3163) data time 0.3965 (0.3965) model time 0.0000 (0.0000) loss 6.9655 (6.9655) grad_norm 2.5037 (2.5037) loss_scale 512.0000 (512.0000) mem 22337MB +[2024-07-29 06:28:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][10/625] eta 0:06:45 lr 0.000037 wd 0.0500 time 0.5895 (0.6590) data time 0.0011 (0.0386) model time 0.0000 (0.0000) loss 7.5232 (6.8516) grad_norm 2.1554 (3.9222) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 06:28:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][20/625] eta 0:06:22 lr 0.000037 wd 0.0500 time 0.5842 (0.6319) data time 0.0008 (0.0207) model time 0.0000 (0.0000) loss 6.3433 (6.7158) grad_norm 2.3022 (3.4780) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 06:28:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][30/625] eta 0:06:10 lr 0.000037 wd 0.0500 time 0.5905 (0.6230) data time 0.0008 (0.0144) model time 0.0000 (0.0000) loss 6.4128 (6.7584) grad_norm 2.3924 (3.4489) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 06:28:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][40/625] eta 0:05:59 lr 0.000037 wd 0.0500 time 0.5859 (0.6145) data time 0.0008 (0.0113) model time 0.0000 (0.0000) loss 6.0525 (6.7233) grad_norm 1.8806 (3.1737) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 06:28:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][50/625] eta 0:05:51 lr 0.000037 wd 0.0500 time 0.5891 (0.6106) data time 0.0010 (0.0093) model time 0.0000 (0.0000) loss 6.4096 (6.6979) grad_norm 13.9374 (3.4120) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 06:28:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][60/625] eta 0:05:43 lr 0.000037 wd 0.0500 time 0.5960 (0.6072) data time 0.0008 (0.0080) model time 0.5952 (0.5889) loss 5.2621 (6.5857) grad_norm 3.9243 (3.3209) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 06:29:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][70/625] eta 0:05:35 lr 0.000037 wd 0.0500 time 0.5875 (0.6047) data time 0.0011 (0.0070) model time 0.5864 (0.5887) loss 6.9794 (6.5463) grad_norm 1.8335 (3.5484) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 06:29:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][80/625] eta 0:05:29 lr 0.000037 wd 0.0500 time 0.6172 (0.6049) data time 0.0008 (0.0063) model time 0.6164 (0.5941) loss 7.8342 (6.5425) grad_norm 2.4816 (3.4213) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 06:29:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][90/625] eta 0:05:22 lr 0.000037 wd 0.0500 time 0.5872 (0.6032) data time 0.0011 (0.0058) model time 0.5862 (0.5926) loss 7.5122 (6.6102) grad_norm 3.3424 (3.3684) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 06:29:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][100/625] eta 0:05:17 lr 0.000037 wd 0.0500 time 0.5908 (0.6040) data time 0.0010 (0.0053) model time 0.5898 (0.5961) loss 6.6439 (6.6050) grad_norm 2.1924 (3.2827) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 06:29:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][110/625] eta 0:05:10 lr 0.000037 wd 0.0500 time 0.5923 (0.6027) data time 0.0007 (0.0049) model time 0.5916 (0.5949) loss 8.3315 (6.6712) grad_norm 2.5011 (3.1902) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 06:29:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][120/625] eta 0:05:03 lr 0.000037 wd 0.0500 time 0.5892 (0.6016) data time 0.0008 (0.0046) model time 0.5884 (0.5939) loss 5.3231 (6.6652) grad_norm 2.0243 (3.1104) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 06:29:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][130/625] eta 0:04:57 lr 0.000037 wd 0.0500 time 0.6069 (0.6007) data time 0.0010 (0.0043) model time 0.6059 (0.5932) loss 6.5122 (6.6729) grad_norm 1.8993 (inf) loss_scale 256.0000 (498.3206) mem 22339MB +[2024-07-29 06:29:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][140/625] eta 0:04:50 lr 0.000037 wd 0.0500 time 0.5927 (0.6000) data time 0.0008 (0.0041) model time 0.5919 (0.5928) loss 5.6124 (6.6603) grad_norm 2.1391 (inf) loss_scale 256.0000 (481.1348) mem 22339MB +[2024-07-29 06:29:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][150/625] eta 0:04:44 lr 0.000037 wd 0.0500 time 0.6015 (0.5996) data time 0.0007 (0.0039) model time 0.6008 (0.5928) loss 7.0929 (6.6508) grad_norm 3.9090 (inf) loss_scale 256.0000 (466.2252) mem 22339MB +[2024-07-29 06:29:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][160/625] eta 0:04:38 lr 0.000037 wd 0.0500 time 0.5887 (0.5990) data time 0.0008 (0.0038) model time 0.5879 (0.5924) loss 7.8487 (6.6764) grad_norm 2.1645 (inf) loss_scale 256.0000 (453.1677) mem 22339MB +[2024-07-29 06:30:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][170/625] eta 0:04:32 lr 0.000037 wd 0.0500 time 0.5892 (0.5984) data time 0.0010 (0.0036) model time 0.5882 (0.5921) loss 6.5111 (6.6731) grad_norm 2.1413 (inf) loss_scale 256.0000 (441.6374) mem 22339MB +[2024-07-29 06:30:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][180/625] eta 0:04:26 lr 0.000037 wd 0.0500 time 0.5903 (0.5978) data time 0.0008 (0.0035) model time 0.5895 (0.5917) loss 7.4115 (6.6952) grad_norm 2.0702 (inf) loss_scale 256.0000 (431.3812) mem 22339MB +[2024-07-29 06:30:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][190/625] eta 0:04:19 lr 0.000037 wd 0.0500 time 0.5894 (0.5973) data time 0.0009 (0.0033) model time 0.5884 (0.5913) loss 7.3254 (6.7084) grad_norm 2.5548 (inf) loss_scale 256.0000 (422.1990) mem 22339MB +[2024-07-29 06:30:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][200/625] eta 0:04:13 lr 0.000036 wd 0.0500 time 0.5994 (0.5970) data time 0.0008 (0.0032) model time 0.5986 (0.5912) loss 6.3854 (6.7054) grad_norm 3.4562 (inf) loss_scale 256.0000 (413.9303) mem 22339MB +[2024-07-29 06:30:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][210/625] eta 0:04:07 lr 0.000036 wd 0.0500 time 0.5956 (0.5968) data time 0.0010 (0.0031) model time 0.5947 (0.5912) loss 6.1457 (6.7025) grad_norm 2.0731 (inf) loss_scale 256.0000 (406.4455) mem 22339MB +[2024-07-29 06:30:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][220/625] eta 0:04:01 lr 0.000036 wd 0.0500 time 0.5955 (0.5966) data time 0.0010 (0.0030) model time 0.5945 (0.5913) loss 7.2379 (6.7136) grad_norm 1.5940 (inf) loss_scale 256.0000 (399.6380) mem 22339MB +[2024-07-29 06:30:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][230/625] eta 0:03:55 lr 0.000036 wd 0.0500 time 0.5892 (0.5964) data time 0.0011 (0.0029) model time 0.5881 (0.5912) loss 7.1380 (6.7235) grad_norm 2.3400 (inf) loss_scale 256.0000 (393.4199) mem 22339MB +[2024-07-29 06:30:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 06:30:41 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 06:30:43 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 06:32:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 06:32:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 06:41:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 06:41:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 06:41:42 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 06:42:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 06:42:17 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 06:42:18 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 06:42:18 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 06:42:18 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 274) +[2024-07-29 06:42:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 06:42:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][240/625] eta 0:23:35 lr 0.000036 wd 0.0500 time 0.5753 (3.6764) data time 0.0007 (0.2184) model time 0.5746 (3.4580) loss 6.4356 (6.8719) grad_norm 2.0382 (4.3410) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 06:42:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][250/625] eta 0:08:10 lr 0.000036 wd 0.0500 time 0.5770 (1.3071) data time 0.0008 (0.0511) model time 0.5762 (1.2561) loss 6.9849 (6.8858) grad_norm 1.9911 (2.9749) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 06:42:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][260/625] eta 0:06:01 lr 0.000036 wd 0.0500 time 0.5749 (0.9898) data time 0.0006 (0.0293) model time 0.5743 (0.9605) loss 7.3876 (6.9412) grad_norm 1.9812 (3.4239) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 06:42:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][270/625] eta 0:05:08 lr 0.000036 wd 0.0500 time 0.5755 (0.8704) data time 0.0007 (0.0206) model time 0.5749 (0.8498) loss 7.3194 (6.9496) grad_norm 1.7124 (3.1464) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 06:42:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 06:42:54 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 06:42:59 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 06:52:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 06:52:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 06:52:31 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 06:52:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 06:52:44 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 06:52:45 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 06:52:45 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 06:52:45 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 274) +[2024-07-29 06:52:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 06:53:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][280/625] eta 0:14:32 lr 0.000036 wd 0.0500 time 0.5625 (2.5294) data time 0.0007 (0.1380) model time 0.5619 (2.3915) loss 7.0189 (7.0747) grad_norm 2.9051 (2.5291) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:53:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][290/625] eta 0:06:48 lr 0.000036 wd 0.0500 time 0.5635 (1.2189) data time 0.0009 (0.0466) model time 0.5626 (1.1723) loss 7.0198 (6.8968) grad_norm 1.8348 (2.5187) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:53:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][300/625] eta 0:05:11 lr 0.000036 wd 0.0500 time 0.5625 (0.9569) data time 0.0009 (0.0283) model time 0.5616 (0.9286) loss 7.1630 (6.8520) grad_norm 2.0977 (2.6038) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:53:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][310/625] eta 0:04:26 lr 0.000036 wd 0.0500 time 0.5619 (0.8446) data time 0.0009 (0.0205) model time 0.5610 (0.8242) loss 6.9300 (6.9096) grad_norm 2.3699 (2.6080) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:53:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][320/625] eta 0:03:58 lr 0.000036 wd 0.0500 time 0.5648 (0.7823) data time 0.0008 (0.0161) model time 0.5640 (0.7662) loss 6.9161 (6.8194) grad_norm 3.3485 (2.6377) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:53:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][330/625] eta 0:03:41 lr 0.000036 wd 0.0500 time 0.7856 (0.7493) data time 0.0006 (0.0133) model time 0.7850 (0.7359) loss 5.7343 (6.7457) grad_norm 2.4069 (2.6765) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:53:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][340/625] eta 0:03:25 lr 0.000036 wd 0.0500 time 0.5654 (0.7209) data time 0.0008 (0.0114) model time 0.5646 (0.7095) loss 7.5122 (6.7464) grad_norm 2.9099 (2.7862) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:53:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][350/625] eta 0:03:12 lr 0.000036 wd 0.0500 time 0.5624 (0.7002) data time 0.0009 (0.0100) model time 0.5615 (0.6902) loss 6.5507 (6.7139) grad_norm 2.8183 (2.8182) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:53:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][360/625] eta 0:03:01 lr 0.000036 wd 0.0500 time 0.5653 (0.6844) data time 0.0006 (0.0089) model time 0.5647 (0.6755) loss 6.6308 (6.6969) grad_norm 2.4849 (2.7717) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:53:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][370/625] eta 0:02:51 lr 0.000036 wd 0.0500 time 0.5667 (0.6720) data time 0.0009 (0.0081) model time 0.5658 (0.6639) loss 7.4228 (6.6949) grad_norm 2.5750 (2.7960) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:53:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][380/625] eta 0:02:42 lr 0.000036 wd 0.0500 time 0.5678 (0.6621) data time 0.0008 (0.0074) model time 0.5670 (0.6547) loss 6.7406 (6.7345) grad_norm 2.2229 (2.8874) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:54:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][390/625] eta 0:02:33 lr 0.000036 wd 0.0500 time 0.5601 (0.6539) data time 0.0007 (0.0068) model time 0.5594 (0.6471) loss 5.7593 (6.7274) grad_norm 2.5093 (2.8453) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:54:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][400/625] eta 0:02:25 lr 0.000036 wd 0.0500 time 0.5666 (0.6471) data time 0.0006 (0.0063) model time 0.5659 (0.6407) loss 6.2777 (6.7433) grad_norm 5.4584 (2.8471) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:54:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][410/625] eta 0:02:17 lr 0.000036 wd 0.0500 time 0.5676 (0.6413) data time 0.0007 (0.0059) model time 0.5669 (0.6353) loss 6.3585 (6.7553) grad_norm 1.9797 (2.8046) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:54:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][420/625] eta 0:02:10 lr 0.000036 wd 0.0500 time 0.5671 (0.6362) data time 0.0009 (0.0056) model time 0.5662 (0.6306) loss 6.9029 (6.7460) grad_norm 2.8993 (2.7737) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:54:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][430/625] eta 0:02:03 lr 0.000036 wd 0.0500 time 0.5657 (0.6318) data time 0.0009 (0.0053) model time 0.5648 (0.6266) loss 7.5857 (6.7553) grad_norm 3.6689 (2.7607) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:54:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][440/625] eta 0:01:56 lr 0.000036 wd 0.0500 time 0.5663 (0.6288) data time 0.0009 (0.0050) model time 0.5655 (0.6237) loss 7.9511 (6.7533) grad_norm 6.8553 (2.7606) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:54:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][450/625] eta 0:01:49 lr 0.000036 wd 0.0500 time 0.5709 (0.6257) data time 0.0007 (0.0048) model time 0.5702 (0.6209) loss 7.0959 (6.7616) grad_norm 2.4122 (2.7516) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:54:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][460/625] eta 0:01:42 lr 0.000036 wd 0.0500 time 0.5724 (0.6227) data time 0.0009 (0.0046) model time 0.5716 (0.6181) loss 6.1212 (6.7445) grad_norm 2.6467 (2.8201) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:54:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][470/625] eta 0:01:36 lr 0.000036 wd 0.0500 time 0.5714 (0.6200) data time 0.0010 (0.0044) model time 0.5705 (0.6156) loss 6.2772 (6.7312) grad_norm 1.7258 (2.8037) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:54:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][480/625] eta 0:01:29 lr 0.000036 wd 0.0500 time 0.5704 (0.6179) data time 0.0008 (0.0042) model time 0.5695 (0.6137) loss 6.7040 (6.7269) grad_norm 2.7234 (2.7864) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:55:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][490/625] eta 0:01:23 lr 0.000036 wd 0.0500 time 0.5690 (0.6157) data time 0.0009 (0.0041) model time 0.5681 (0.6117) loss 6.8476 (6.7157) grad_norm 2.6531 (2.7678) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:55:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][500/625] eta 0:01:16 lr 0.000036 wd 0.0500 time 0.5694 (0.6137) data time 0.0007 (0.0039) model time 0.5687 (0.6098) loss 6.1368 (6.7152) grad_norm 3.2312 (2.7485) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:55:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][510/625] eta 0:01:10 lr 0.000036 wd 0.0500 time 0.5680 (0.6130) data time 0.0009 (0.0038) model time 0.5671 (0.6092) loss 7.4511 (6.7140) grad_norm 2.0668 (2.7312) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:55:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][520/625] eta 0:01:04 lr 0.000036 wd 0.0500 time 0.5675 (0.6112) data time 0.0009 (0.0037) model time 0.5666 (0.6075) loss 6.3914 (6.7157) grad_norm 2.9037 (2.7263) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:55:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][530/625] eta 0:00:57 lr 0.000035 wd 0.0500 time 0.5705 (0.6104) data time 0.0006 (0.0036) model time 0.5698 (0.6068) loss 6.1081 (6.7056) grad_norm 2.9651 (2.7129) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:55:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][540/625] eta 0:00:51 lr 0.000035 wd 0.0500 time 0.5700 (0.6094) data time 0.0006 (0.0035) model time 0.5694 (0.6060) loss 6.2178 (6.6979) grad_norm 2.6497 (2.7045) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:55:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][550/625] eta 0:00:45 lr 0.000035 wd 0.0500 time 0.5700 (0.6089) data time 0.0009 (0.0034) model time 0.5691 (0.6056) loss 7.6835 (6.6933) grad_norm 2.5887 (2.8673) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:55:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][560/625] eta 0:00:39 lr 0.000035 wd 0.0500 time 0.5707 (0.6081) data time 0.0009 (0.0033) model time 0.5698 (0.6049) loss 8.0139 (6.6991) grad_norm 2.4500 (2.8656) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:55:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][570/625] eta 0:00:33 lr 0.000035 wd 0.0500 time 0.5685 (0.6070) data time 0.0009 (0.0032) model time 0.5676 (0.6038) loss 6.3661 (6.6940) grad_norm 3.9126 (2.8617) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 06:55:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 06:55:51 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 06:55:57 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 06:58:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 06:58:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 06:58:27 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 06:58:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 06:58:44 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 06:58:45 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 06:58:45 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 06:58:45 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 274) +[2024-07-29 06:58:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 06:59:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][580/625] eta 0:02:06 lr 0.000035 wd 0.0500 time 0.5849 (2.8165) data time 0.0008 (0.1494) model time 0.5841 (2.6671) loss 7.0374 (6.8610) grad_norm 2.6671 (3.3935) loss_scale 256.0000 (256.0000) mem 22343MB +[2024-07-29 06:59:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][590/625] eta 0:00:46 lr 0.000035 wd 0.0500 time 0.5896 (1.3340) data time 0.0010 (0.0505) model time 0.5886 (1.2834) loss 7.2813 (6.8112) grad_norm 2.5253 (2.8971) loss_scale 256.0000 (256.0000) mem 22343MB +[2024-07-29 06:59:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][600/625] eta 0:00:25 lr 0.000035 wd 0.0500 time 0.5835 (1.0364) data time 0.0010 (0.0307) model time 0.5825 (1.0056) loss 8.0384 (6.8648) grad_norm 1.8980 (2.8252) loss_scale 256.0000 (256.0000) mem 22343MB +[2024-07-29 06:59:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][610/625] eta 0:00:13 lr 0.000035 wd 0.0500 time 0.5859 (0.9077) data time 0.0008 (0.0223) model time 0.5851 (0.8854) loss 7.2001 (6.8892) grad_norm 2.9819 (2.7687) loss_scale 256.0000 (256.0000) mem 22343MB +[2024-07-29 06:59:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [274/300][620/625] eta 0:00:04 lr 0.000035 wd 0.0500 time 0.5858 (0.8363) data time 0.0008 (0.0175) model time 0.5851 (0.8187) loss 8.0847 (6.8621) grad_norm 2.4333 (2.6616) loss_scale 256.0000 (256.0000) mem 22343MB +[2024-07-29 06:59:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 274 training takes 0:00:39 +[2024-07-29 06:59:30 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 06:59:34 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 06:59:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.523 (0.523) Loss 0.4861 (0.4861) Acc@1 90.283 (90.283) Acc@5 99.072 (99.072) Mem 22343MB +[2024-07-29 06:59:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.162) Loss 0.7344 (0.5910) Acc@1 83.301 (88.179) Acc@5 97.266 (98.189) Mem 22343MB +[2024-07-29 06:59:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.145) Loss 0.8125 (0.6763) Acc@1 81.396 (85.649) Acc@5 96.289 (97.440) Mem 22343MB +[2024-07-29 06:59:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.307 Acc@5 97.431 +[2024-07-29 06:59:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 06:59:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 85.31% +[2024-07-29 06:59:40 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-29 06:59:41 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-29 06:59:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.509 (0.509) Loss 0.4946 (0.4946) Acc@1 90.332 (90.332) Acc@5 99.023 (99.023) Mem 22343MB +[2024-07-29 06:59:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.160) Loss 0.7344 (0.5985) Acc@1 83.350 (88.250) Acc@5 97.119 (98.167) Mem 22343MB +[2024-07-29 06:59:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.144) Loss 0.8184 (0.6827) Acc@1 81.348 (85.624) Acc@5 96.387 (97.398) Mem 22343MB +[2024-07-29 06:59:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.231 Acc@5 97.403 +[2024-07-29 06:59:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.2% +[2024-07-29 06:59:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.23% +[2024-07-29 06:59:45 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 06:59:46 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 06:59:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][0/625] eta 0:14:39 lr 0.000035 wd 0.0500 time 1.4076 (1.4076) data time 0.3873 (0.3873) model time 0.0000 (0.0000) loss 6.3768 (6.3768) grad_norm 1.8668 (1.8668) loss_scale 256.0000 (256.0000) mem 22337MB +[2024-07-29 06:59:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][10/625] eta 0:07:01 lr 0.000035 wd 0.0500 time 0.5904 (0.6846) data time 0.0008 (0.0361) model time 0.0000 (0.0000) loss 6.2400 (6.4520) grad_norm 1.8338 (2.5462) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:00:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][20/625] eta 0:06:27 lr 0.000035 wd 0.0500 time 0.5893 (0.6398) data time 0.0009 (0.0194) model time 0.0000 (0.0000) loss 5.4705 (6.5501) grad_norm 1.9824 (2.6976) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:00:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][30/625] eta 0:06:10 lr 0.000035 wd 0.0500 time 0.5911 (0.6222) data time 0.0011 (0.0135) model time 0.0000 (0.0000) loss 6.8174 (6.6437) grad_norm 10.1401 (2.8902) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:00:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][40/625] eta 0:05:59 lr 0.000035 wd 0.0500 time 0.6017 (0.6137) data time 0.0008 (0.0106) model time 0.0000 (0.0000) loss 6.5043 (6.5893) grad_norm 1.5358 (2.9271) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:00:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][50/625] eta 0:05:50 lr 0.000035 wd 0.0500 time 0.5915 (0.6090) data time 0.0010 (0.0087) model time 0.0000 (0.0000) loss 7.8158 (6.6234) grad_norm 2.0762 (2.8617) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:00:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][60/625] eta 0:05:42 lr 0.000035 wd 0.0500 time 0.5853 (0.6062) data time 0.0011 (0.0075) model time 0.5842 (0.5909) loss 5.4170 (6.6096) grad_norm 2.8592 (2.8142) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:00:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][70/625] eta 0:05:35 lr 0.000035 wd 0.0500 time 0.5990 (0.6040) data time 0.0008 (0.0066) model time 0.5982 (0.5902) loss 7.4940 (6.6447) grad_norm 2.9903 (2.7554) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:00:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][80/625] eta 0:05:27 lr 0.000035 wd 0.0500 time 0.5748 (0.6013) data time 0.0008 (0.0059) model time 0.5740 (0.5872) loss 6.6554 (6.6066) grad_norm 1.7702 (2.7145) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:00:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][90/625] eta 0:05:20 lr 0.000035 wd 0.0500 time 0.5810 (0.5989) data time 0.0008 (0.0053) model time 0.5802 (0.5849) loss 5.3992 (6.6142) grad_norm 1.8628 (2.6768) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:00:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][100/625] eta 0:05:13 lr 0.000035 wd 0.0500 time 0.5801 (0.5968) data time 0.0010 (0.0049) model time 0.5791 (0.5833) loss 7.3832 (6.6411) grad_norm 3.0765 (2.7676) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:00:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][110/625] eta 0:05:06 lr 0.000035 wd 0.0500 time 0.5797 (0.5951) data time 0.0010 (0.0046) model time 0.5787 (0.5823) loss 6.7530 (6.6596) grad_norm 2.4846 (2.7721) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:00:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][120/625] eta 0:04:59 lr 0.000035 wd 0.0500 time 0.5776 (0.5939) data time 0.0008 (0.0043) model time 0.5768 (0.5818) loss 5.4793 (6.6802) grad_norm 1.8501 (2.7432) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:01:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][130/625] eta 0:04:53 lr 0.000035 wd 0.0500 time 0.5862 (0.5928) data time 0.0008 (0.0041) model time 0.5854 (0.5813) loss 6.3943 (6.6518) grad_norm 2.8740 (2.7429) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:01:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][140/625] eta 0:04:47 lr 0.000035 wd 0.0500 time 0.5755 (0.5928) data time 0.0008 (0.0039) model time 0.5747 (0.5825) loss 6.0577 (6.6504) grad_norm 2.3224 (2.7427) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:01:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][150/625] eta 0:04:41 lr 0.000035 wd 0.0500 time 0.5986 (0.5922) data time 0.0009 (0.0037) model time 0.5977 (0.5825) loss 7.2640 (6.6241) grad_norm 1.9425 (2.7368) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:01:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][160/625] eta 0:04:35 lr 0.000035 wd 0.0500 time 0.5846 (0.5916) data time 0.0010 (0.0035) model time 0.5836 (0.5823) loss 6.6340 (6.6158) grad_norm 1.9394 (2.7226) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:01:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][170/625] eta 0:04:28 lr 0.000035 wd 0.0500 time 0.5805 (0.5909) data time 0.0012 (0.0034) model time 0.5793 (0.5821) loss 7.4953 (6.6017) grad_norm 3.4865 (2.7864) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:01:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][180/625] eta 0:04:22 lr 0.000035 wd 0.0500 time 0.5825 (0.5904) data time 0.0011 (0.0032) model time 0.5814 (0.5820) loss 7.9330 (6.6125) grad_norm 4.6457 (2.7991) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:01:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][190/625] eta 0:04:16 lr 0.000035 wd 0.0500 time 0.5794 (0.5900) data time 0.0011 (0.0031) model time 0.5782 (0.5819) loss 8.1239 (6.6135) grad_norm 2.6180 (2.7917) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:01:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][200/625] eta 0:04:10 lr 0.000035 wd 0.0500 time 0.5771 (0.5895) data time 0.0010 (0.0030) model time 0.5761 (0.5818) loss 5.9201 (6.6215) grad_norm 2.7543 (2.7929) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:01:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][210/625] eta 0:04:04 lr 0.000035 wd 0.0500 time 0.5787 (0.5891) data time 0.0011 (0.0029) model time 0.5776 (0.5817) loss 6.5947 (6.6161) grad_norm 2.4585 (2.9035) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:01:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][220/625] eta 0:03:58 lr 0.000035 wd 0.0500 time 0.5861 (0.5889) data time 0.0008 (0.0028) model time 0.5853 (0.5818) loss 7.6776 (6.6120) grad_norm 2.2891 (2.8726) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:02:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][230/625] eta 0:03:52 lr 0.000035 wd 0.0500 time 0.5820 (0.5896) data time 0.0011 (0.0028) model time 0.5809 (0.5830) loss 6.8171 (6.6218) grad_norm 3.4821 (2.8694) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:02:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][240/625] eta 0:03:46 lr 0.000035 wd 0.0500 time 0.5880 (0.5893) data time 0.0012 (0.0027) model time 0.5868 (0.5829) loss 6.1192 (6.6279) grad_norm 3.3676 (2.8712) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:02:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][250/625] eta 0:03:40 lr 0.000034 wd 0.0500 time 0.5802 (0.5890) data time 0.0008 (0.0026) model time 0.5794 (0.5828) loss 6.0345 (6.6163) grad_norm 1.9638 (2.9473) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:02:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][260/625] eta 0:03:34 lr 0.000034 wd 0.0500 time 0.5835 (0.5887) data time 0.0010 (0.0026) model time 0.5825 (0.5827) loss 6.1213 (6.6090) grad_norm 2.3868 (2.9466) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:02:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][270/625] eta 0:03:28 lr 0.000034 wd 0.0500 time 0.5833 (0.5885) data time 0.0011 (0.0025) model time 0.5822 (0.5827) loss 6.3703 (6.6285) grad_norm 2.2524 (2.9245) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:02:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][280/625] eta 0:03:22 lr 0.000034 wd 0.0500 time 0.5816 (0.5883) data time 0.0008 (0.0025) model time 0.5808 (0.5826) loss 7.0195 (6.6313) grad_norm 1.7584 (2.9335) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:02:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][290/625] eta 0:03:16 lr 0.000034 wd 0.0500 time 0.5794 (0.5880) data time 0.0010 (0.0024) model time 0.5784 (0.5825) loss 7.0126 (6.6303) grad_norm 2.2087 (2.9133) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:02:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][300/625] eta 0:03:11 lr 0.000034 wd 0.0500 time 0.5929 (0.5879) data time 0.0013 (0.0024) model time 0.5915 (0.5825) loss 6.1214 (6.6387) grad_norm 2.1436 (2.8939) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:02:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][310/625] eta 0:03:05 lr 0.000034 wd 0.0500 time 0.5842 (0.5878) data time 0.0009 (0.0023) model time 0.5834 (0.5825) loss 7.6313 (6.6406) grad_norm 2.3226 (2.8707) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:02:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][320/625] eta 0:02:59 lr 0.000034 wd 0.0500 time 0.5803 (0.5875) data time 0.0009 (0.0023) model time 0.5794 (0.5824) loss 7.8199 (6.6393) grad_norm 2.5482 (2.8686) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:03:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][330/625] eta 0:02:53 lr 0.000034 wd 0.0500 time 0.5820 (0.5874) data time 0.0009 (0.0023) model time 0.5811 (0.5823) loss 5.9777 (6.6382) grad_norm 3.2733 (2.8749) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:03:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][340/625] eta 0:02:47 lr 0.000034 wd 0.0500 time 0.5795 (0.5871) data time 0.0011 (0.0022) model time 0.5784 (0.5822) loss 5.9622 (6.6228) grad_norm 2.0095 (2.8704) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:03:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][350/625] eta 0:02:41 lr 0.000034 wd 0.0500 time 0.5743 (0.5870) data time 0.0010 (0.0022) model time 0.5733 (0.5821) loss 6.3380 (6.6240) grad_norm 2.6242 (2.8931) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:03:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][360/625] eta 0:02:35 lr 0.000034 wd 0.0500 time 0.5793 (0.5871) data time 0.0008 (0.0022) model time 0.5785 (0.5825) loss 6.2199 (6.6266) grad_norm 3.6634 (2.8969) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:03:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][370/625] eta 0:02:29 lr 0.000034 wd 0.0500 time 0.5830 (0.5870) data time 0.0007 (0.0021) model time 0.5823 (0.5824) loss 7.0134 (6.6291) grad_norm 1.9331 (2.8758) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:03:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][380/625] eta 0:02:23 lr 0.000034 wd 0.0500 time 0.5830 (0.5868) data time 0.0012 (0.0021) model time 0.5818 (0.5823) loss 7.8179 (6.6373) grad_norm 2.9390 (2.8763) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:03:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][390/625] eta 0:02:17 lr 0.000034 wd 0.0500 time 0.5848 (0.5867) data time 0.0011 (0.0021) model time 0.5837 (0.5823) loss 6.2534 (6.6439) grad_norm 6.1879 (2.8762) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:03:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][400/625] eta 0:02:11 lr 0.000034 wd 0.0500 time 0.5781 (0.5866) data time 0.0007 (0.0020) model time 0.5774 (0.5822) loss 6.3415 (6.6410) grad_norm 2.4401 (2.8652) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:03:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][410/625] eta 0:02:06 lr 0.000034 wd 0.0500 time 0.5837 (0.5865) data time 0.0008 (0.0020) model time 0.5830 (0.5822) loss 7.3924 (6.6322) grad_norm 2.1461 (2.8540) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:04:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][420/625] eta 0:02:06 lr 0.000034 wd 0.0500 time 0.8911 (0.6172) data time 0.0011 (0.0020) model time 0.8900 (0.6172) loss 6.7191 (6.6279) grad_norm 2.2495 (2.8445) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:04:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][430/625] eta 0:02:03 lr 0.000034 wd 0.0500 time 2.8374 (0.6342) data time 0.0010 (0.0020) model time 2.8364 (0.6366) loss 6.0268 (6.6278) grad_norm 7.0955 (2.8448) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:04:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][440/625] eta 0:02:00 lr 0.000034 wd 0.0500 time 0.5778 (0.6538) data time 0.0008 (0.0020) model time 0.5770 (0.6586) loss 6.0683 (6.6305) grad_norm 1.6438 (2.8431) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:04:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][450/625] eta 0:01:56 lr 0.000034 wd 0.0500 time 1.5306 (0.6667) data time 0.0008 (0.0019) model time 1.5298 (0.6730) loss 5.6754 (6.6284) grad_norm 2.0565 (2.8410) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:05:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][460/625] eta 0:01:53 lr 0.000034 wd 0.0500 time 0.7977 (0.6854) data time 0.0011 (0.0019) model time 0.7966 (0.6938) loss 7.2130 (6.6373) grad_norm 2.6000 (2.8396) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:05:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][470/625] eta 0:01:48 lr 0.000034 wd 0.0500 time 1.1540 (0.6988) data time 0.0008 (0.0019) model time 1.1532 (0.7087) loss 6.1308 (6.6374) grad_norm 7.6232 (2.8666) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:05:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][480/625] eta 0:01:43 lr 0.000034 wd 0.0500 time 1.4675 (0.7112) data time 0.0009 (0.0019) model time 1.4665 (0.7223) loss 6.1415 (6.6314) grad_norm 2.7115 (2.8640) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:05:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][490/625] eta 0:01:37 lr 0.000034 wd 0.0500 time 1.5258 (0.7252) data time 0.0008 (0.0040) model time 1.5251 (0.7352) loss 6.5371 (6.6323) grad_norm 3.5951 (2.8715) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:05:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][500/625] eta 0:01:32 lr 0.000034 wd 0.0500 time 3.7241 (0.7382) data time 0.0009 (0.0048) model time 3.7232 (0.7485) loss 5.6007 (6.6304) grad_norm 2.7476 (2.8747) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:06:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][510/625] eta 0:01:25 lr 0.000034 wd 0.0500 time 1.0012 (0.7476) data time 0.0010 (0.0047) model time 1.0002 (0.7587) loss 7.0444 (6.6339) grad_norm 3.0039 (2.9359) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:06:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][520/625] eta 0:01:19 lr 0.000034 wd 0.0500 time 0.5714 (0.7587) data time 0.0010 (0.0046) model time 0.5704 (0.7707) loss 6.9831 (6.6388) grad_norm 2.3516 (2.9327) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:06:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][530/625] eta 0:01:13 lr 0.000034 wd 0.0500 time 1.4538 (0.7692) data time 0.0008 (0.0046) model time 1.4530 (0.7822) loss 6.3435 (6.6369) grad_norm 2.8177 (2.9296) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:06:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][540/625] eta 0:01:06 lr 0.000034 wd 0.0500 time 1.9214 (0.7775) data time 0.0009 (0.0045) model time 1.9205 (0.7909) loss 7.1727 (6.6454) grad_norm 1.9107 (2.9244) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:06:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][550/625] eta 0:00:58 lr 0.000034 wd 0.0500 time 1.5958 (0.7862) data time 0.0011 (0.0044) model time 1.5947 (0.8003) loss 6.9608 (6.6443) grad_norm 2.2307 (2.9291) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:07:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][560/625] eta 0:00:51 lr 0.000034 wd 0.0500 time 0.5778 (0.7943) data time 0.0008 (0.0044) model time 0.5770 (0.8089) loss 6.5283 (6.6412) grad_norm 3.5541 (2.9394) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:07:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][570/625] eta 0:00:44 lr 0.000034 wd 0.0500 time 0.7670 (0.8026) data time 0.0010 (0.0043) model time 0.7660 (0.8177) loss 6.8887 (6.6440) grad_norm 2.4656 (2.9306) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:07:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][580/625] eta 0:00:36 lr 0.000034 wd 0.0500 time 1.3529 (0.8127) data time 0.0009 (0.0043) model time 1.3520 (0.8284) loss 6.7473 (6.6430) grad_norm 2.7696 (2.9299) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:07:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][590/625] eta 0:00:28 lr 0.000034 wd 0.0500 time 2.1447 (0.8242) data time 0.0007 (0.0042) model time 2.1440 (0.8407) loss 5.3885 (6.6430) grad_norm 2.1112 (2.9171) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:08:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][600/625] eta 0:00:20 lr 0.000033 wd 0.0500 time 0.7359 (0.8335) data time 0.0011 (0.0041) model time 0.7348 (0.8506) loss 6.5170 (6.6373) grad_norm 2.3266 (2.9687) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:08:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][610/625] eta 0:00:12 lr 0.000033 wd 0.0500 time 2.0574 (0.8458) data time 0.0006 (0.0041) model time 2.0569 (0.8637) loss 6.3438 (6.6365) grad_norm 2.4341 (2.9628) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:08:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [275/300][620/625] eta 0:00:04 lr 0.000033 wd 0.0500 time 0.5784 (0.8479) data time 0.0005 (0.0040) model time 0.5779 (0.8657) loss 7.8374 (6.6407) grad_norm 2.2631 (2.9621) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:08:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 275 training takes 0:08:48 +[2024-07-29 07:08:35 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 07:08:36 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 07:08:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 4.376 (4.376) Loss 0.4937 (0.4937) Acc@1 90.234 (90.234) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-29 07:08:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.127 (0.513) Loss 0.7417 (0.5940) Acc@1 83.105 (88.179) Acc@5 97.217 (98.171) Mem 22339MB +[2024-07-29 07:08:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.127 (0.330) Loss 0.8115 (0.6790) Acc@1 81.299 (85.612) Acc@5 96.338 (97.421) Mem 22339MB +[2024-07-29 07:08:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.249 Acc@5 97.399 +[2024-07-29 07:08:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.2% +[2024-07-29 07:08:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 4.069 (4.069) Loss 0.4941 (0.4941) Acc@1 90.332 (90.332) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-29 07:08:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.128 (0.493) Loss 0.7339 (0.5981) Acc@1 83.301 (88.250) Acc@5 97.119 (98.153) Mem 22339MB +[2024-07-29 07:08:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.127 (0.319) Loss 0.8184 (0.6822) Acc@1 81.299 (85.633) Acc@5 96.387 (97.396) Mem 22339MB +[2024-07-29 07:08:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.243 Acc@5 97.395 +[2024-07-29 07:08:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.2% +[2024-07-29 07:08:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.24% +[2024-07-29 07:08:51 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 07:08:56 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 07:09:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][0/625] eta 3:38:52 lr 0.000033 wd 0.0500 time 21.0121 (21.0121) data time 13.5736 (13.5736) model time 0.0000 (0.0000) loss 7.1929 (7.1929) grad_norm 2.3285 (2.3285) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:09:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][10/625] eta 0:24:56 lr 0.000033 wd 0.0500 time 0.5736 (2.4338) data time 0.0010 (1.2354) model time 0.0000 (0.0000) loss 7.1321 (6.4954) grad_norm 2.3353 (2.3034) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:09:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][20/625] eta 0:19:21 lr 0.000033 wd 0.0500 time 2.8560 (1.9201) data time 0.0010 (0.6476) model time 0.0000 (0.0000) loss 6.1107 (6.5153) grad_norm 2.7477 (2.3972) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:09:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][30/625] eta 0:16:45 lr 0.000033 wd 0.0500 time 1.0940 (1.6900) data time 0.0007 (0.4390) model time 0.0000 (0.0000) loss 7.6255 (6.5290) grad_norm 2.3932 (2.8639) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:10:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][40/625] eta 0:15:56 lr 0.000033 wd 0.0500 time 1.4756 (1.6356) data time 0.0010 (0.3322) model time 0.0000 (0.0000) loss 7.1128 (6.4536) grad_norm 1.8851 (2.7334) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:10:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 07:10:17 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 07:10:18 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 07:13:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 07:14:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 07:14:24 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 07:14:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 07:14:34 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 07:14:35 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 07:14:35 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 07:14:35 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 276) +[2024-07-29 07:14:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 07:14:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][50/625] eta 0:52:09 lr 0.000033 wd 0.0500 time 1.7234 (5.4425) data time 0.0008 (0.3471) model time 0.0000 (0.0000) loss 7.8186 (7.5615) grad_norm 2.5732 (2.6990) loss_scale 256.0000 (256.0000) mem 22342MB +[2024-07-29 07:14:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][60/625] eta 0:13:01 lr 0.000033 wd 0.0500 time 0.5709 (1.3826) data time 0.0006 (0.0587) model time 0.5702 (0.5697) loss 5.9022 (6.7447) grad_norm 2.5807 (3.5676) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 07:15:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][70/625] eta 0:09:22 lr 0.000033 wd 0.0500 time 0.5685 (1.0137) data time 0.0008 (0.0324) model time 0.5677 (0.5698) loss 7.1277 (6.8633) grad_norm 2.0067 (3.5639) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 07:15:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][80/625] eta 0:07:56 lr 0.000033 wd 0.0500 time 0.5726 (0.8751) data time 0.0006 (0.0226) model time 0.5719 (0.5696) loss 6.6850 (6.9052) grad_norm 2.0450 (3.1992) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 07:15:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][90/625] eta 0:07:09 lr 0.000033 wd 0.0500 time 0.5716 (0.8027) data time 0.0009 (0.0174) model time 0.5707 (0.5697) loss 7.8152 (6.8447) grad_norm 2.1556 (3.0379) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 07:15:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][100/625] eta 0:06:39 lr 0.000033 wd 0.0500 time 0.5229 (0.7605) data time 0.0007 (0.0142) model time 0.5223 (0.5723) loss 6.4272 (6.8028) grad_norm 2.2216 (2.9445) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 07:15:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][110/625] eta 0:06:17 lr 0.000033 wd 0.0500 time 0.5914 (0.7337) data time 0.0006 (0.0121) model time 0.5908 (0.5758) loss 7.8149 (6.7651) grad_norm 3.2182 (3.0076) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 07:15:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][120/625] eta 0:05:59 lr 0.000033 wd 0.0500 time 0.5735 (0.7114) data time 0.0008 (0.0105) model time 0.5726 (0.5753) loss 6.9381 (6.7132) grad_norm 2.4839 (2.9761) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 07:15:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 07:15:34 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 07:15:40 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 07:17:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 07:17:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 07:18:06 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 07:18:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 07:18:21 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 07:18:22 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 07:18:22 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 07:18:22 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 276) +[2024-07-29 07:18:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 07:18:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][130/625] eta 0:34:46 lr 0.000033 wd 0.0500 time 0.5886 (4.2153) data time 0.0008 (0.2587) model time 0.5877 (3.9566) loss 6.6487 (7.3932) grad_norm 2.0514 (3.1265) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 07:18:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][140/625] eta 0:11:31 lr 0.000033 wd 0.0500 time 0.5863 (1.4261) data time 0.0010 (0.0606) model time 0.5852 (1.3655) loss 6.6682 (7.0485) grad_norm 2.2589 (2.5914) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 07:18:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][150/625] eta 0:08:24 lr 0.000033 wd 0.0500 time 0.5908 (1.0627) data time 0.0008 (0.0348) model time 0.5899 (1.0280) loss 6.8249 (7.0395) grad_norm 2.2450 (2.4933) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 07:18:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][160/625] eta 0:07:08 lr 0.000033 wd 0.0500 time 0.5950 (0.9206) data time 0.0009 (0.0245) model time 0.5941 (0.8961) loss 7.6341 (7.0043) grad_norm 2.3200 (2.7835) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 07:19:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 07:19:01 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 07:19:05 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 07:22:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 07:22:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 07:24:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 07:24:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 07:24:38 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 07:24:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 07:24:55 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 07:24:56 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 07:24:56 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 07:24:56 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 276) +[2024-07-29 07:24:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 07:25:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][170/625] eta 0:28:28 lr 0.000033 wd 0.0500 time 0.5826 (3.7555) data time 0.0008 (0.1763) model time 0.5818 (3.5792) loss 7.4844 (6.9792) grad_norm 2.5900 (2.6367) loss_scale 256.0000 (256.0000) mem 22345MB +[2024-07-29 07:25:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][180/625] eta 0:11:02 lr 0.000033 wd 0.0500 time 0.5831 (1.4890) data time 0.0008 (0.0511) model time 0.5823 (1.4379) loss 6.1316 (6.8967) grad_norm 2.3107 (3.0911) loss_scale 256.0000 (256.0000) mem 22345MB +[2024-07-29 07:25:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][190/625] eta 0:08:03 lr 0.000033 wd 0.0500 time 0.5800 (1.1105) data time 0.0011 (0.0303) model time 0.5790 (1.0802) loss 6.5796 (6.8675) grad_norm 1.8495 (2.7900) loss_scale 256.0000 (256.0000) mem 22345MB +[2024-07-29 07:25:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][200/625] eta 0:06:46 lr 0.000033 wd 0.0500 time 0.5850 (0.9554) data time 0.0008 (0.0217) model time 0.5843 (0.9337) loss 5.9427 (6.8296) grad_norm 2.5144 (2.7709) loss_scale 256.0000 (256.0000) mem 22345MB +[2024-07-29 07:25:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][210/625] eta 0:06:01 lr 0.000033 wd 0.0500 time 0.5847 (0.8713) data time 0.0008 (0.0170) model time 0.5839 (0.8543) loss 6.9171 (6.7977) grad_norm 2.9669 (3.2795) loss_scale 256.0000 (256.0000) mem 22345MB +[2024-07-29 07:25:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][220/625] eta 0:05:32 lr 0.000033 wd 0.0500 time 0.5835 (0.8216) data time 0.0008 (0.0140) model time 0.5827 (0.8075) loss 7.1790 (6.8095) grad_norm 2.9455 (3.2701) loss_scale 256.0000 (256.0000) mem 22345MB +[2024-07-29 07:25:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][230/625] eta 0:05:11 lr 0.000033 wd 0.0500 time 0.5893 (0.7886) data time 0.0008 (0.0120) model time 0.5886 (0.7766) loss 6.6509 (6.7319) grad_norm 3.0722 (3.1308) loss_scale 256.0000 (256.0000) mem 22345MB +[2024-07-29 07:25:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][240/625] eta 0:04:53 lr 0.000033 wd 0.0500 time 0.5880 (0.7614) data time 0.0008 (0.0105) model time 0.5872 (0.7509) loss 6.6098 (6.6954) grad_norm 3.6659 (3.0552) loss_scale 256.0000 (256.0000) mem 22345MB +[2024-07-29 07:26:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][250/625] eta 0:04:37 lr 0.000033 wd 0.0500 time 0.5840 (0.7405) data time 0.0010 (0.0094) model time 0.5830 (0.7311) loss 6.4764 (6.6679) grad_norm 2.0565 (2.9809) loss_scale 256.0000 (256.0000) mem 22345MB +[2024-07-29 07:26:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][260/625] eta 0:04:24 lr 0.000033 wd 0.0500 time 0.5876 (0.7241) data time 0.0010 (0.0085) model time 0.5865 (0.7156) loss 6.4211 (6.6508) grad_norm 2.4583 (2.9121) loss_scale 256.0000 (256.0000) mem 22345MB +[2024-07-29 07:26:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][270/625] eta 0:04:12 lr 0.000033 wd 0.0500 time 0.5842 (0.7109) data time 0.0011 (0.0078) model time 0.5831 (0.7031) loss 6.0775 (6.6869) grad_norm 1.8146 (2.8573) loss_scale 256.0000 (256.0000) mem 22345MB +[2024-07-29 07:26:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][280/625] eta 0:04:01 lr 0.000033 wd 0.0500 time 0.5868 (0.6999) data time 0.0012 (0.0072) model time 0.5857 (0.6928) loss 8.2734 (6.6992) grad_norm 2.1274 (2.8122) loss_scale 256.0000 (256.0000) mem 22345MB +[2024-07-29 07:26:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][290/625] eta 0:03:51 lr 0.000033 wd 0.0500 time 0.5841 (0.6908) data time 0.0010 (0.0067) model time 0.5830 (0.6841) loss 6.1136 (6.6993) grad_norm 7.3701 (2.8546) loss_scale 256.0000 (256.0000) mem 22345MB +[2024-07-29 07:26:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][300/625] eta 0:03:41 lr 0.000033 wd 0.0500 time 0.5872 (0.6830) data time 0.0010 (0.0063) model time 0.5861 (0.6768) loss 7.3546 (6.7063) grad_norm 1.9685 (2.8121) loss_scale 256.0000 (256.0000) mem 22345MB +[2024-07-29 07:26:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][310/625] eta 0:03:33 lr 0.000033 wd 0.0500 time 0.5880 (0.6764) data time 0.0008 (0.0059) model time 0.5873 (0.6705) loss 6.5981 (6.7001) grad_norm 1.7239 (2.7738) loss_scale 256.0000 (256.0000) mem 22345MB +[2024-07-29 07:26:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][320/625] eta 0:03:24 lr 0.000033 wd 0.0500 time 0.5950 (0.6706) data time 0.0007 (0.0056) model time 0.5943 (0.6650) loss 6.6353 (6.6897) grad_norm 2.2286 (2.7475) loss_scale 256.0000 (256.0000) mem 22345MB +[2024-07-29 07:26:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][330/625] eta 0:03:16 lr 0.000032 wd 0.0500 time 0.5861 (0.6655) data time 0.0008 (0.0053) model time 0.5853 (0.6602) loss 6.7416 (6.6875) grad_norm 3.3787 (2.7604) loss_scale 256.0000 (256.0000) mem 22345MB +[2024-07-29 07:26:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][340/625] eta 0:03:08 lr 0.000032 wd 0.0500 time 0.5907 (0.6608) data time 0.0010 (0.0051) model time 0.5897 (0.6558) loss 5.8650 (6.7029) grad_norm 2.1039 (inf) loss_scale 128.0000 (249.3793) mem 22345MB +[2024-07-29 07:27:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][350/625] eta 0:03:00 lr 0.000032 wd 0.0500 time 0.5884 (0.6568) data time 0.0009 (0.0048) model time 0.5876 (0.6520) loss 6.8709 (6.7060) grad_norm 2.6652 (inf) loss_scale 128.0000 (242.7826) mem 22345MB +[2024-07-29 07:27:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][360/625] eta 0:02:53 lr 0.000032 wd 0.0500 time 0.5835 (0.6531) data time 0.0007 (0.0046) model time 0.5828 (0.6485) loss 5.7845 (6.7094) grad_norm 2.5888 (inf) loss_scale 128.0000 (236.8660) mem 22345MB +[2024-07-29 07:27:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][370/625] eta 0:02:45 lr 0.000032 wd 0.0500 time 0.5904 (0.6498) data time 0.0010 (0.0045) model time 0.5893 (0.6453) loss 6.9271 (6.7037) grad_norm 3.2234 (inf) loss_scale 128.0000 (231.5294) mem 22345MB +[2024-07-29 07:27:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][380/625] eta 0:02:38 lr 0.000032 wd 0.0500 time 0.5952 (0.6469) data time 0.0008 (0.0043) model time 0.5944 (0.6426) loss 6.7269 (6.6900) grad_norm 2.2057 (inf) loss_scale 128.0000 (226.6916) mem 22345MB +[2024-07-29 07:27:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][390/625] eta 0:02:31 lr 0.000032 wd 0.0500 time 0.5913 (0.6443) data time 0.0012 (0.0042) model time 0.5900 (0.6401) loss 8.0525 (6.6869) grad_norm 2.1728 (inf) loss_scale 128.0000 (222.2857) mem 22345MB +[2024-07-29 07:27:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][400/625] eta 0:02:24 lr 0.000032 wd 0.0500 time 0.5958 (0.6419) data time 0.0008 (0.0040) model time 0.5951 (0.6379) loss 6.0737 (6.6851) grad_norm 2.2199 (inf) loss_scale 128.0000 (218.2564) mem 22345MB +[2024-07-29 07:27:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][410/625] eta 0:02:17 lr 0.000032 wd 0.0500 time 0.5882 (0.6398) data time 0.0008 (0.0039) model time 0.5874 (0.6359) loss 5.8992 (6.6984) grad_norm 9.2812 (inf) loss_scale 128.0000 (214.5574) mem 22345MB +[2024-07-29 07:27:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][420/625] eta 0:02:10 lr 0.000032 wd 0.0500 time 0.5888 (0.6378) data time 0.0008 (0.0038) model time 0.5880 (0.6340) loss 5.5831 (6.6808) grad_norm 2.5932 (inf) loss_scale 128.0000 (211.1496) mem 22345MB +[2024-07-29 07:27:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][430/625] eta 0:02:04 lr 0.000032 wd 0.0500 time 0.5893 (0.6359) data time 0.0009 (0.0037) model time 0.5885 (0.6322) loss 5.9711 (6.6671) grad_norm 2.6082 (inf) loss_scale 128.0000 (208.0000) mem 22345MB +[2024-07-29 07:27:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][440/625] eta 0:01:57 lr 0.000032 wd 0.0500 time 0.8052 (0.6352) data time 0.0010 (0.0036) model time 0.8042 (0.6316) loss 7.1235 (6.6633) grad_norm 2.0749 (inf) loss_scale 128.0000 (205.0803) mem 22345MB +[2024-07-29 07:28:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][450/625] eta 0:01:50 lr 0.000032 wd 0.0500 time 0.5893 (0.6335) data time 0.0009 (0.0035) model time 0.5884 (0.6300) loss 5.9754 (6.6649) grad_norm 4.6551 (inf) loss_scale 128.0000 (202.3662) mem 22345MB +[2024-07-29 07:28:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][460/625] eta 0:01:44 lr 0.000032 wd 0.0500 time 0.5896 (0.6320) data time 0.0008 (0.0034) model time 0.5888 (0.6285) loss 5.4291 (6.6493) grad_norm 2.4053 (inf) loss_scale 128.0000 (199.8367) mem 22345MB +[2024-07-29 07:28:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][470/625] eta 0:01:37 lr 0.000032 wd 0.0500 time 0.5890 (0.6305) data time 0.0010 (0.0034) model time 0.5880 (0.6272) loss 7.7091 (6.6431) grad_norm 1.9925 (inf) loss_scale 128.0000 (197.4737) mem 22345MB +[2024-07-29 07:28:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][480/625] eta 0:01:31 lr 0.000032 wd 0.0500 time 0.5884 (0.6292) data time 0.0012 (0.0033) model time 0.5872 (0.6259) loss 7.9929 (6.6453) grad_norm 2.4180 (inf) loss_scale 128.0000 (195.2611) mem 22345MB +[2024-07-29 07:28:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][490/625] eta 0:01:24 lr 0.000032 wd 0.0500 time 0.5893 (0.6279) data time 0.0008 (0.0032) model time 0.5885 (0.6247) loss 6.7621 (6.6577) grad_norm 2.0239 (inf) loss_scale 128.0000 (193.1852) mem 22345MB +[2024-07-29 07:28:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][500/625] eta 0:01:18 lr 0.000032 wd 0.0500 time 0.5868 (0.6267) data time 0.0010 (0.0032) model time 0.5857 (0.6236) loss 5.8851 (6.6571) grad_norm 2.8259 (inf) loss_scale 128.0000 (191.2335) mem 22345MB +[2024-07-29 07:28:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][510/625] eta 0:01:11 lr 0.000032 wd 0.0500 time 0.5854 (0.6256) data time 0.0010 (0.0031) model time 0.5844 (0.6225) loss 7.5288 (6.6645) grad_norm 2.6203 (inf) loss_scale 128.0000 (189.3953) mem 22345MB +[2024-07-29 07:28:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][520/625] eta 0:01:05 lr 0.000032 wd 0.0500 time 0.5788 (0.6245) data time 0.0010 (0.0030) model time 0.5778 (0.6214) loss 6.8434 (6.6674) grad_norm 2.7803 (inf) loss_scale 128.0000 (187.6610) mem 22345MB +[2024-07-29 07:28:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][530/625] eta 0:00:59 lr 0.000032 wd 0.0500 time 0.5889 (0.6234) data time 0.0008 (0.0030) model time 0.5881 (0.6205) loss 5.4705 (6.6593) grad_norm 2.2499 (inf) loss_scale 128.0000 (186.0220) mem 22345MB +[2024-07-29 07:28:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][540/625] eta 0:00:52 lr 0.000032 wd 0.0500 time 0.5855 (0.6225) data time 0.0011 (0.0029) model time 0.5844 (0.6195) loss 8.0280 (6.6567) grad_norm 3.2987 (inf) loss_scale 128.0000 (184.4706) mem 22345MB +[2024-07-29 07:28:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][550/625] eta 0:00:46 lr 0.000032 wd 0.0500 time 0.5915 (0.6216) data time 0.0010 (0.0029) model time 0.5906 (0.6187) loss 6.1474 (6.6490) grad_norm 2.4108 (inf) loss_scale 128.0000 (183.0000) mem 22345MB +[2024-07-29 07:29:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][560/625] eta 0:00:40 lr 0.000032 wd 0.0500 time 0.5883 (0.6208) data time 0.0010 (0.0028) model time 0.5873 (0.6179) loss 6.6137 (6.6451) grad_norm 2.0441 (inf) loss_scale 128.0000 (181.6041) mem 22345MB +[2024-07-29 07:29:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][570/625] eta 0:00:34 lr 0.000032 wd 0.0500 time 0.6042 (0.6200) data time 0.0010 (0.0028) model time 0.6032 (0.6172) loss 6.7846 (6.6477) grad_norm 2.2197 (inf) loss_scale 128.0000 (180.2772) mem 22345MB +[2024-07-29 07:29:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][580/625] eta 0:00:27 lr 0.000032 wd 0.0500 time 0.5868 (0.6192) data time 0.0011 (0.0027) model time 0.5857 (0.6164) loss 5.3411 (6.6480) grad_norm 2.2080 (inf) loss_scale 128.0000 (179.0145) mem 22345MB +[2024-07-29 07:29:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][590/625] eta 0:00:21 lr 0.000032 wd 0.0500 time 0.5887 (0.6185) data time 0.0008 (0.0027) model time 0.5879 (0.6158) loss 6.8298 (6.6491) grad_norm 2.3159 (inf) loss_scale 128.0000 (177.8113) mem 22345MB +[2024-07-29 07:29:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][600/625] eta 0:00:15 lr 0.000032 wd 0.0500 time 0.5871 (0.6178) data time 0.0010 (0.0027) model time 0.5860 (0.6151) loss 8.4566 (6.6576) grad_norm 2.4832 (inf) loss_scale 128.0000 (176.6636) mem 22345MB +[2024-07-29 07:29:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][610/625] eta 0:00:09 lr 0.000032 wd 0.0500 time 0.5884 (0.6171) data time 0.0007 (0.0026) model time 0.5876 (0.6144) loss 6.6827 (6.6518) grad_norm 2.0784 (inf) loss_scale 128.0000 (175.5676) mem 22345MB +[2024-07-29 07:29:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [276/300][620/625] eta 0:00:03 lr 0.000032 wd 0.0500 time 0.5899 (0.6164) data time 0.0005 (0.0026) model time 0.5893 (0.6139) loss 7.2089 (6.6460) grad_norm 2.0508 (inf) loss_scale 128.0000 (174.5198) mem 22345MB +[2024-07-29 07:29:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 276 training takes 0:04:42 +[2024-07-29 07:29:43 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 07:29:49 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 07:29:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.529 (0.529) Loss 0.4907 (0.4907) Acc@1 90.381 (90.381) Acc@5 98.975 (98.975) Mem 22345MB +[2024-07-29 07:29:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.162) Loss 0.7358 (0.5917) Acc@1 83.057 (88.219) Acc@5 97.021 (98.171) Mem 22345MB +[2024-07-29 07:29:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.145) Loss 0.8120 (0.6766) Acc@1 80.957 (85.656) Acc@5 96.289 (97.421) Mem 22345MB +[2024-07-29 07:29:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.271 Acc@5 97.405 +[2024-07-29 07:29:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 07:29:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.853 (0.853) Loss 0.4941 (0.4941) Acc@1 90.332 (90.332) Acc@5 99.023 (99.023) Mem 22345MB +[2024-07-29 07:29:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.192) Loss 0.7339 (0.5980) Acc@1 83.350 (88.255) Acc@5 97.168 (98.158) Mem 22345MB +[2024-07-29 07:29:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.160) Loss 0.8174 (0.6820) Acc@1 81.152 (85.626) Acc@5 96.387 (97.398) Mem 22345MB +[2024-07-29 07:29:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.235 Acc@5 97.395 +[2024-07-29 07:29:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.2% +[2024-07-29 07:29:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.23% +[2024-07-29 07:29:59 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 07:30:03 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 07:30:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][0/625] eta 0:23:27 lr 0.000032 wd 0.0500 time 2.2518 (2.2518) data time 0.4615 (0.4615) model time 0.0000 (0.0000) loss 6.0984 (6.0984) grad_norm 2.9342 (2.9342) loss_scale 128.0000 (128.0000) mem 22337MB +[2024-07-29 07:30:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][10/625] eta 0:07:29 lr 0.000032 wd 0.0500 time 0.5736 (0.7306) data time 0.0008 (0.0428) model time 0.0000 (0.0000) loss 5.5292 (6.5393) grad_norm 2.0797 (2.5246) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:30:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][20/625] eta 0:06:37 lr 0.000032 wd 0.0500 time 0.5755 (0.6565) data time 0.0010 (0.0229) model time 0.0000 (0.0000) loss 7.6086 (6.6298) grad_norm 75.7561 (6.1365) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:30:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][30/625] eta 0:06:15 lr 0.000032 wd 0.0500 time 0.5797 (0.6305) data time 0.0011 (0.0158) model time 0.0000 (0.0000) loss 5.6062 (6.6383) grad_norm 5.2217 (5.3259) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:30:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][40/625] eta 0:06:03 lr 0.000032 wd 0.0500 time 0.5734 (0.6220) data time 0.0008 (0.0122) model time 0.0000 (0.0000) loss 6.3863 (6.6040) grad_norm 2.1311 (4.6449) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:30:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][50/625] eta 0:05:52 lr 0.000032 wd 0.0500 time 0.5776 (0.6131) data time 0.0008 (0.0101) model time 0.0000 (0.0000) loss 7.4735 (6.6451) grad_norm 2.4084 (4.2521) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:30:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][60/625] eta 0:05:42 lr 0.000032 wd 0.0500 time 0.5737 (0.6069) data time 0.0010 (0.0086) model time 0.5728 (0.5743) loss 5.4679 (6.6668) grad_norm 2.1040 (3.9907) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:30:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][70/625] eta 0:05:34 lr 0.000031 wd 0.0500 time 0.5767 (0.6024) data time 0.0007 (0.0075) model time 0.5760 (0.5743) loss 7.1594 (6.6481) grad_norm 1.8874 (3.8225) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:30:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][80/625] eta 0:05:26 lr 0.000031 wd 0.0500 time 0.5776 (0.5992) data time 0.0011 (0.0067) model time 0.5765 (0.5745) loss 6.7651 (6.6617) grad_norm 1.8653 (3.8353) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:30:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][90/625] eta 0:05:19 lr 0.000031 wd 0.0500 time 0.5780 (0.5968) data time 0.0010 (0.0060) model time 0.5770 (0.5750) loss 7.4076 (6.6789) grad_norm 2.1684 (3.6740) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:31:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][100/625] eta 0:05:12 lr 0.000031 wd 0.0500 time 0.5751 (0.5945) data time 0.0009 (0.0056) model time 0.5742 (0.5746) loss 6.6462 (6.6878) grad_norm 6.0281 (3.6196) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:31:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][110/625] eta 0:05:05 lr 0.000031 wd 0.0500 time 0.5790 (0.5928) data time 0.0011 (0.0052) model time 0.5779 (0.5746) loss 6.0714 (6.7032) grad_norm 1.8660 (3.5200) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:31:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][120/625] eta 0:04:58 lr 0.000031 wd 0.0500 time 0.5806 (0.5915) data time 0.0008 (0.0048) model time 0.5798 (0.5747) loss 7.4976 (6.7217) grad_norm 2.4127 (3.4526) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:31:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][130/625] eta 0:04:52 lr 0.000031 wd 0.0500 time 0.5728 (0.5902) data time 0.0011 (0.0045) model time 0.5717 (0.5746) loss 5.6706 (6.7055) grad_norm 2.1751 (3.3790) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:31:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][140/625] eta 0:04:45 lr 0.000031 wd 0.0500 time 0.5775 (0.5893) data time 0.0010 (0.0043) model time 0.5765 (0.5748) loss 6.4378 (6.7126) grad_norm 2.8326 (3.3161) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:31:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][150/625] eta 0:04:39 lr 0.000031 wd 0.0500 time 0.5751 (0.5885) data time 0.0008 (0.0041) model time 0.5743 (0.5750) loss 6.1034 (6.7003) grad_norm 2.2209 (3.3381) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:31:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][160/625] eta 0:04:33 lr 0.000031 wd 0.0500 time 0.5768 (0.5877) data time 0.0008 (0.0039) model time 0.5760 (0.5749) loss 6.7311 (6.7103) grad_norm 2.0163 (3.3263) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:31:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][170/625] eta 0:04:27 lr 0.000031 wd 0.0500 time 0.5745 (0.5871) data time 0.0011 (0.0037) model time 0.5734 (0.5750) loss 6.0011 (6.6967) grad_norm 2.1268 (3.4677) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:31:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][180/625] eta 0:04:21 lr 0.000031 wd 0.0500 time 0.5765 (0.5866) data time 0.0007 (0.0036) model time 0.5758 (0.5751) loss 7.2864 (6.7027) grad_norm 2.4878 (3.4201) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:31:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][190/625] eta 0:04:14 lr 0.000031 wd 0.0500 time 0.5764 (0.5860) data time 0.0008 (0.0034) model time 0.5756 (0.5752) loss 6.4037 (6.6780) grad_norm 2.2160 (3.3858) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:32:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][200/625] eta 0:04:09 lr 0.000031 wd 0.0500 time 0.5764 (0.5860) data time 0.0010 (0.0033) model time 0.5754 (0.5757) loss 6.6359 (6.6640) grad_norm 3.1641 (3.3564) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:32:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][210/625] eta 0:04:03 lr 0.000031 wd 0.0500 time 0.5786 (0.5855) data time 0.0008 (0.0032) model time 0.5779 (0.5757) loss 6.7317 (6.6602) grad_norm 2.9041 (3.3161) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:32:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][220/625] eta 0:03:56 lr 0.000031 wd 0.0500 time 0.5763 (0.5852) data time 0.0009 (0.0031) model time 0.5755 (0.5758) loss 6.3742 (6.6644) grad_norm 2.0823 (3.2833) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:32:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][230/625] eta 0:03:51 lr 0.000031 wd 0.0500 time 0.5806 (0.5849) data time 0.0010 (0.0030) model time 0.5796 (0.5759) loss 6.2320 (6.6576) grad_norm 3.2094 (3.2634) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:32:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][240/625] eta 0:03:45 lr 0.000031 wd 0.0500 time 0.5858 (0.5846) data time 0.0010 (0.0029) model time 0.5848 (0.5759) loss 5.7252 (6.6538) grad_norm 2.8981 (3.2760) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:32:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][250/625] eta 0:03:39 lr 0.000031 wd 0.0500 time 0.5738 (0.5842) data time 0.0010 (0.0029) model time 0.5728 (0.5758) loss 5.2775 (6.6492) grad_norm 2.0475 (3.2614) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:32:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][260/625] eta 0:03:33 lr 0.000031 wd 0.0500 time 0.5751 (0.5847) data time 0.0007 (0.0028) model time 0.5744 (0.5768) loss 7.5457 (6.6441) grad_norm 5.3660 (3.2546) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:32:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][270/625] eta 0:03:27 lr 0.000031 wd 0.0500 time 0.5761 (0.5844) data time 0.0007 (0.0027) model time 0.5753 (0.5767) loss 7.5010 (6.6427) grad_norm 2.0639 (3.2216) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:32:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][280/625] eta 0:03:21 lr 0.000031 wd 0.0500 time 0.5839 (0.5843) data time 0.0010 (0.0027) model time 0.5828 (0.5768) loss 6.8599 (6.6616) grad_norm 2.2677 (3.1864) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:32:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][290/625] eta 0:03:15 lr 0.000031 wd 0.0500 time 0.5751 (0.5840) data time 0.0010 (0.0026) model time 0.5741 (0.5768) loss 6.7841 (6.6527) grad_norm 2.7884 (3.1659) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:32:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][300/625] eta 0:03:09 lr 0.000031 wd 0.0500 time 0.5758 (0.5838) data time 0.0010 (0.0026) model time 0.5748 (0.5767) loss 7.4328 (6.6487) grad_norm 2.0408 (3.1436) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:33:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][310/625] eta 0:03:03 lr 0.000031 wd 0.0500 time 0.5756 (0.5835) data time 0.0007 (0.0025) model time 0.5749 (0.5767) loss 6.0784 (6.6488) grad_norm 2.0645 (3.1312) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:33:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][320/625] eta 0:02:57 lr 0.000031 wd 0.0500 time 0.5756 (0.5832) data time 0.0008 (0.0025) model time 0.5749 (0.5766) loss 5.2746 (6.6549) grad_norm 2.2211 (3.1477) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:33:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][330/625] eta 0:02:51 lr 0.000031 wd 0.0500 time 0.5756 (0.5830) data time 0.0010 (0.0024) model time 0.5746 (0.5765) loss 6.3022 (6.6535) grad_norm 12.8630 (3.1764) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:33:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][340/625] eta 0:02:46 lr 0.000031 wd 0.0500 time 0.5792 (0.5827) data time 0.0010 (0.0024) model time 0.5782 (0.5764) loss 6.8343 (6.6564) grad_norm 2.3112 (3.1537) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:33:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][350/625] eta 0:02:40 lr 0.000031 wd 0.0500 time 0.5764 (0.5825) data time 0.0007 (0.0023) model time 0.5757 (0.5763) loss 5.5593 (6.6512) grad_norm 5.0194 (3.1392) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:33:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][360/625] eta 0:02:34 lr 0.000031 wd 0.0500 time 0.5796 (0.5824) data time 0.0010 (0.0023) model time 0.5786 (0.5763) loss 6.8313 (6.6576) grad_norm 1.9764 (3.1751) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:33:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][370/625] eta 0:02:28 lr 0.000031 wd 0.0500 time 0.5777 (0.5822) data time 0.0009 (0.0023) model time 0.5768 (0.5763) loss 6.3498 (6.6534) grad_norm 2.3684 (3.1577) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:33:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][380/625] eta 0:02:22 lr 0.000031 wd 0.0500 time 0.5761 (0.5821) data time 0.0008 (0.0022) model time 0.5753 (0.5762) loss 6.8962 (6.6455) grad_norm 1.9940 (3.1389) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:33:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][390/625] eta 0:02:16 lr 0.000031 wd 0.0500 time 0.5763 (0.5819) data time 0.0010 (0.0022) model time 0.5753 (0.5762) loss 6.6474 (6.6369) grad_norm 2.7811 (3.1208) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:33:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][400/625] eta 0:02:10 lr 0.000031 wd 0.0500 time 0.5766 (0.5818) data time 0.0008 (0.0022) model time 0.5758 (0.5763) loss 6.1425 (6.6404) grad_norm 1.8733 (3.1565) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:34:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][410/625] eta 0:02:05 lr 0.000031 wd 0.0500 time 0.5761 (0.5817) data time 0.0012 (0.0021) model time 0.5750 (0.5763) loss 6.5523 (6.6430) grad_norm 2.0908 (3.1500) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:34:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][420/625] eta 0:01:59 lr 0.000031 wd 0.0500 time 0.5728 (0.5820) data time 0.0008 (0.0021) model time 0.5720 (0.5767) loss 5.8316 (6.6350) grad_norm 2.6413 (3.1334) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:34:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][430/625] eta 0:01:53 lr 0.000031 wd 0.0500 time 0.5853 (0.5819) data time 0.0007 (0.0021) model time 0.5846 (0.5767) loss 6.7776 (6.6345) grad_norm 5.7673 (3.2025) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:34:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][440/625] eta 0:01:47 lr 0.000030 wd 0.0500 time 0.5834 (0.5818) data time 0.0010 (0.0021) model time 0.5824 (0.5767) loss 6.4830 (6.6275) grad_norm 1.8540 (3.1818) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:34:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][450/625] eta 0:01:41 lr 0.000030 wd 0.0500 time 0.5766 (0.5817) data time 0.0011 (0.0020) model time 0.5755 (0.5767) loss 7.6459 (6.6353) grad_norm 2.3203 (3.1704) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:34:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][460/625] eta 0:01:35 lr 0.000030 wd 0.0500 time 0.5784 (0.5816) data time 0.0008 (0.0020) model time 0.5776 (0.5766) loss 7.1811 (6.6283) grad_norm 2.6765 (3.1543) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:34:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][470/625] eta 0:01:30 lr 0.000030 wd 0.0500 time 0.5764 (0.5815) data time 0.0008 (0.0020) model time 0.5756 (0.5766) loss 7.5024 (6.6362) grad_norm 3.3409 (3.1483) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:34:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][480/625] eta 0:01:24 lr 0.000030 wd 0.0500 time 0.5727 (0.5817) data time 0.0011 (0.0020) model time 0.5716 (0.5769) loss 6.8164 (6.6316) grad_norm 3.4517 (3.1395) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:34:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][490/625] eta 0:01:18 lr 0.000030 wd 0.0500 time 0.5761 (0.5816) data time 0.0010 (0.0020) model time 0.5751 (0.5769) loss 6.3192 (6.6259) grad_norm 1.8262 (3.1246) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:34:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][500/625] eta 0:01:12 lr 0.000030 wd 0.0500 time 0.5759 (0.5815) data time 0.0008 (0.0019) model time 0.5750 (0.5769) loss 5.5107 (6.6238) grad_norm 3.6427 (3.1663) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:35:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][510/625] eta 0:01:06 lr 0.000030 wd 0.0500 time 0.5770 (0.5813) data time 0.0010 (0.0019) model time 0.5760 (0.5768) loss 7.4547 (6.6308) grad_norm 2.3211 (3.1620) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:35:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][520/625] eta 0:01:01 lr 0.000030 wd 0.0500 time 0.5739 (0.5812) data time 0.0011 (0.0019) model time 0.5728 (0.5768) loss 5.9742 (6.6285) grad_norm 7.3950 (3.1680) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:35:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][530/625] eta 0:00:55 lr 0.000030 wd 0.0500 time 0.5747 (0.5811) data time 0.0008 (0.0019) model time 0.5739 (0.5767) loss 7.6984 (6.6311) grad_norm 2.9424 (3.1628) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:35:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][540/625] eta 0:00:49 lr 0.000030 wd 0.0500 time 0.5747 (0.5810) data time 0.0011 (0.0019) model time 0.5737 (0.5767) loss 7.3355 (6.6343) grad_norm 2.2092 (3.1531) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:35:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][550/625] eta 0:00:43 lr 0.000030 wd 0.0500 time 0.5738 (0.5809) data time 0.0008 (0.0019) model time 0.5731 (0.5766) loss 6.5870 (6.6311) grad_norm 3.7941 (3.1445) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:35:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][560/625] eta 0:00:37 lr 0.000030 wd 0.0500 time 0.5739 (0.5808) data time 0.0007 (0.0018) model time 0.5731 (0.5766) loss 5.6306 (6.6290) grad_norm 2.8290 (3.1368) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:35:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][570/625] eta 0:00:31 lr 0.000030 wd 0.0500 time 0.5748 (0.5807) data time 0.0010 (0.0018) model time 0.5738 (0.5765) loss 6.8609 (6.6188) grad_norm 2.1278 (3.1222) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:35:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][580/625] eta 0:00:26 lr 0.000030 wd 0.0500 time 0.5762 (0.5807) data time 0.0010 (0.0018) model time 0.5752 (0.5765) loss 7.3650 (6.6195) grad_norm 1.8020 (3.1099) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:35:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][590/625] eta 0:00:20 lr 0.000030 wd 0.0500 time 0.5779 (0.5806) data time 0.0008 (0.0018) model time 0.5771 (0.5765) loss 7.0014 (6.6175) grad_norm 2.6829 (3.0986) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:35:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][600/625] eta 0:00:14 lr 0.000030 wd 0.0500 time 0.5755 (0.5806) data time 0.0010 (0.0018) model time 0.5745 (0.5766) loss 6.3481 (6.6192) grad_norm 2.5960 (3.0878) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:35:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][610/625] eta 0:00:08 lr 0.000030 wd 0.0500 time 0.5767 (0.5806) data time 0.0008 (0.0018) model time 0.5759 (0.5766) loss 5.3927 (6.6127) grad_norm 2.6337 (3.0807) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:36:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [277/300][620/625] eta 0:00:02 lr 0.000030 wd 0.0500 time 0.5743 (0.5805) data time 0.0005 (0.0018) model time 0.5738 (0.5765) loss 7.5301 (6.6168) grad_norm 3.1730 (3.0792) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:36:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 277 training takes 0:06:02 +[2024-07-29 07:36:06 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 07:36:11 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 07:36:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.565 (0.565) Loss 0.4878 (0.4878) Acc@1 90.479 (90.479) Acc@5 99.023 (99.023) Mem 22338MB +[2024-07-29 07:36:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.166) Loss 0.7324 (0.5898) Acc@1 82.910 (88.175) Acc@5 97.168 (98.216) Mem 22338MB +[2024-07-29 07:36:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.147) Loss 0.7998 (0.6741) Acc@1 81.006 (85.568) Acc@5 96.533 (97.477) Mem 22338MB +[2024-07-29 07:36:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.235 Acc@5 97.445 +[2024-07-29 07:36:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.2% +[2024-07-29 07:36:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 1.062 (1.062) Loss 0.4937 (0.4937) Acc@1 90.381 (90.381) Acc@5 99.023 (99.023) Mem 22338MB +[2024-07-29 07:36:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.211) Loss 0.7339 (0.5974) Acc@1 83.301 (88.268) Acc@5 97.119 (98.158) Mem 22338MB +[2024-07-29 07:36:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.171) Loss 0.8174 (0.6815) Acc@1 81.104 (85.624) Acc@5 96.387 (97.403) Mem 22338MB +[2024-07-29 07:36:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.233 Acc@5 97.399 +[2024-07-29 07:36:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.2% +[2024-07-29 07:36:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][0/625] eta 0:18:42 lr 0.000030 wd 0.0500 time 1.7953 (1.7953) data time 1.1133 (1.1133) model time 0.0000 (0.0000) loss 6.2581 (6.2581) grad_norm 2.0268 (2.0268) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:36:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][10/625] eta 0:07:02 lr 0.000030 wd 0.0500 time 0.5748 (0.6864) data time 0.0011 (0.1022) model time 0.0000 (0.0000) loss 5.9551 (6.4951) grad_norm 1.9857 (2.8317) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:36:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][20/625] eta 0:06:23 lr 0.000030 wd 0.0500 time 0.5966 (0.6345) data time 0.0010 (0.0540) model time 0.0000 (0.0000) loss 7.2919 (6.5436) grad_norm 1.8714 (2.6369) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:36:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][30/625] eta 0:06:06 lr 0.000030 wd 0.0500 time 0.5800 (0.6158) data time 0.0010 (0.0369) model time 0.0000 (0.0000) loss 6.5394 (6.5942) grad_norm 2.3120 (2.6102) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:36:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][40/625] eta 0:05:54 lr 0.000030 wd 0.0500 time 0.5759 (0.6061) data time 0.0010 (0.0282) model time 0.0000 (0.0000) loss 5.9203 (6.6662) grad_norm 2.8017 (2.6670) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:36:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][50/625] eta 0:05:45 lr 0.000030 wd 0.0500 time 0.5736 (0.6001) data time 0.0008 (0.0229) model time 0.0000 (0.0000) loss 6.5786 (6.6445) grad_norm 2.5522 (2.8319) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:36:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][60/625] eta 0:05:36 lr 0.000030 wd 0.0500 time 0.5740 (0.5961) data time 0.0010 (0.0193) model time 0.5730 (0.5746) loss 7.8195 (6.7060) grad_norm 2.6932 (2.8251) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:37:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][70/625] eta 0:05:29 lr 0.000030 wd 0.0500 time 0.5753 (0.5931) data time 0.0008 (0.0167) model time 0.5745 (0.5739) loss 5.7117 (6.6369) grad_norm 3.2067 (2.7852) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:37:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][80/625] eta 0:05:23 lr 0.000030 wd 0.0500 time 0.5749 (0.5929) data time 0.0008 (0.0148) model time 0.5740 (0.5794) loss 5.4596 (6.6119) grad_norm 2.7801 (2.7802) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:37:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][90/625] eta 0:05:16 lr 0.000030 wd 0.0500 time 0.5865 (0.5912) data time 0.0008 (0.0133) model time 0.5856 (0.5787) loss 7.2363 (6.6530) grad_norm 2.6099 (2.7913) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:37:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][100/625] eta 0:05:09 lr 0.000030 wd 0.0500 time 0.5765 (0.5897) data time 0.0010 (0.0121) model time 0.5755 (0.5779) loss 7.4737 (6.7015) grad_norm 2.1184 (2.8136) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:37:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][110/625] eta 0:05:03 lr 0.000030 wd 0.0500 time 0.5776 (0.5886) data time 0.0011 (0.0111) model time 0.5764 (0.5776) loss 6.9178 (6.7046) grad_norm 1.8266 (2.9640) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:37:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][120/625] eta 0:04:56 lr 0.000030 wd 0.0500 time 0.5801 (0.5877) data time 0.0011 (0.0103) model time 0.5790 (0.5775) loss 6.0827 (6.6858) grad_norm 2.6981 (2.9268) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:37:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][130/625] eta 0:04:50 lr 0.000030 wd 0.0500 time 0.5824 (0.5869) data time 0.0010 (0.0096) model time 0.5814 (0.5774) loss 5.9228 (6.6636) grad_norm 4.4257 (3.0144) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:37:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][140/625] eta 0:04:44 lr 0.000030 wd 0.0500 time 0.5775 (0.5861) data time 0.0010 (0.0089) model time 0.5764 (0.5771) loss 7.4462 (6.6765) grad_norm 3.3337 (2.9999) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:37:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][150/625] eta 0:04:38 lr 0.000030 wd 0.0500 time 0.5747 (0.5854) data time 0.0010 (0.0084) model time 0.5737 (0.5769) loss 5.5506 (6.6661) grad_norm 2.5652 (2.9937) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:37:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][160/625] eta 0:04:31 lr 0.000030 wd 0.0500 time 0.5775 (0.5849) data time 0.0010 (0.0080) model time 0.5765 (0.5767) loss 6.3107 (6.6742) grad_norm 2.8708 (3.0641) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:37:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][170/625] eta 0:04:25 lr 0.000030 wd 0.0500 time 0.5751 (0.5846) data time 0.0011 (0.0076) model time 0.5740 (0.5769) loss 6.9121 (6.6550) grad_norm 2.2078 (3.0291) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:38:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][180/625] eta 0:04:19 lr 0.000030 wd 0.0500 time 0.5907 (0.5842) data time 0.0011 (0.0072) model time 0.5896 (0.5769) loss 6.8663 (6.6663) grad_norm 2.5272 (2.9899) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:38:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][190/625] eta 0:04:13 lr 0.000030 wd 0.0500 time 0.5758 (0.5837) data time 0.0009 (0.0069) model time 0.5749 (0.5767) loss 7.6310 (6.6664) grad_norm 5.4752 (2.9829) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:38:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][200/625] eta 0:04:07 lr 0.000029 wd 0.0500 time 0.5778 (0.5834) data time 0.0011 (0.0066) model time 0.5767 (0.5767) loss 6.9321 (6.6792) grad_norm 1.9420 (2.9511) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:38:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][210/625] eta 0:04:02 lr 0.000029 wd 0.0500 time 0.5767 (0.5836) data time 0.0008 (0.0063) model time 0.5759 (0.5773) loss 7.8327 (6.6870) grad_norm 2.0163 (2.9252) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:38:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][220/625] eta 0:03:56 lr 0.000029 wd 0.0500 time 0.5780 (0.5834) data time 0.0009 (0.0061) model time 0.5771 (0.5773) loss 6.8587 (6.6906) grad_norm 3.1285 (2.9351) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:38:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][230/625] eta 0:03:50 lr 0.000029 wd 0.0500 time 0.6057 (0.5834) data time 0.0008 (0.0059) model time 0.6049 (0.5776) loss 7.0973 (6.6760) grad_norm 2.5591 (2.9263) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:38:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][240/625] eta 0:03:44 lr 0.000029 wd 0.0500 time 0.5776 (0.5831) data time 0.0008 (0.0057) model time 0.5768 (0.5775) loss 5.9151 (6.6582) grad_norm 2.2963 (2.9167) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:38:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][250/625] eta 0:03:38 lr 0.000029 wd 0.0500 time 0.5761 (0.5830) data time 0.0011 (0.0056) model time 0.5751 (0.5775) loss 7.0977 (6.6676) grad_norm 7.3571 (2.9212) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:38:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][260/625] eta 0:03:32 lr 0.000029 wd 0.0500 time 0.5771 (0.5828) data time 0.0011 (0.0054) model time 0.5760 (0.5774) loss 7.1024 (6.6690) grad_norm 2.1381 (2.9171) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:38:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][270/625] eta 0:03:26 lr 0.000029 wd 0.0500 time 0.5728 (0.5825) data time 0.0011 (0.0053) model time 0.5717 (0.5773) loss 6.2486 (6.6737) grad_norm 2.0790 (2.9084) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:39:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][280/625] eta 0:03:21 lr 0.000029 wd 0.0500 time 0.5730 (0.5834) data time 0.0010 (0.0051) model time 0.5720 (0.5785) loss 6.7682 (6.6885) grad_norm 2.4131 (2.8926) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:39:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][290/625] eta 0:03:15 lr 0.000029 wd 0.0500 time 0.6045 (0.5833) data time 0.0010 (0.0050) model time 0.6035 (0.5785) loss 7.3791 (6.7005) grad_norm 2.5474 (2.9105) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:39:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][300/625] eta 0:03:09 lr 0.000029 wd 0.0500 time 0.5763 (0.5839) data time 0.0011 (0.0049) model time 0.5753 (0.5793) loss 5.7315 (6.6894) grad_norm 2.8379 (2.9022) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:39:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][310/625] eta 0:03:03 lr 0.000029 wd 0.0500 time 0.5752 (0.5838) data time 0.0009 (0.0048) model time 0.5743 (0.5794) loss 8.2199 (6.6830) grad_norm 2.2322 (2.8924) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:39:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][320/625] eta 0:02:58 lr 0.000029 wd 0.0500 time 0.5748 (0.5837) data time 0.0008 (0.0047) model time 0.5740 (0.5793) loss 6.0986 (6.6895) grad_norm 2.2783 (2.9096) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:39:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][330/625] eta 0:02:52 lr 0.000029 wd 0.0500 time 0.5781 (0.5836) data time 0.0009 (0.0046) model time 0.5771 (0.5793) loss 6.6453 (6.6950) grad_norm 2.5884 (2.8989) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:39:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][340/625] eta 0:02:46 lr 0.000029 wd 0.0500 time 0.5764 (0.5836) data time 0.0011 (0.0045) model time 0.5753 (0.5794) loss 7.9026 (6.6957) grad_norm 2.3468 (2.9063) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:39:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][350/625] eta 0:02:40 lr 0.000029 wd 0.0500 time 0.5854 (0.5835) data time 0.0008 (0.0044) model time 0.5846 (0.5795) loss 6.0012 (6.6889) grad_norm 2.2150 (2.9006) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:39:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][360/625] eta 0:02:34 lr 0.000029 wd 0.0500 time 0.6649 (0.5836) data time 0.0009 (0.0043) model time 0.6641 (0.5797) loss 7.0250 (6.6917) grad_norm 3.2181 (2.8852) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:39:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][370/625] eta 0:02:28 lr 0.000029 wd 0.0500 time 0.5760 (0.5836) data time 0.0010 (0.0042) model time 0.5750 (0.5797) loss 7.1903 (6.6843) grad_norm 2.8386 (2.8843) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:40:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][380/625] eta 0:02:22 lr 0.000029 wd 0.0500 time 0.5782 (0.5835) data time 0.0008 (0.0042) model time 0.5775 (0.5796) loss 7.4611 (6.6959) grad_norm 2.9224 (2.8933) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:40:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][390/625] eta 0:02:17 lr 0.000029 wd 0.0500 time 0.5771 (0.5833) data time 0.0011 (0.0041) model time 0.5760 (0.5795) loss 7.7294 (6.7011) grad_norm 2.1464 (2.8825) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:40:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][400/625] eta 0:02:11 lr 0.000029 wd 0.0500 time 0.5804 (0.5832) data time 0.0008 (0.0040) model time 0.5796 (0.5794) loss 5.3754 (6.6857) grad_norm 16.3279 (2.9104) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:40:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][410/625] eta 0:02:05 lr 0.000029 wd 0.0500 time 0.5712 (0.5829) data time 0.0009 (0.0039) model time 0.5704 (0.5793) loss 6.5954 (6.6750) grad_norm 2.6143 (2.9041) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:40:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][420/625] eta 0:01:59 lr 0.000029 wd 0.0500 time 0.5852 (0.5828) data time 0.0011 (0.0039) model time 0.5841 (0.5792) loss 7.5703 (6.6714) grad_norm 2.6745 (2.8956) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:40:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][430/625] eta 0:01:53 lr 0.000029 wd 0.0500 time 0.5792 (0.5829) data time 0.0010 (0.0038) model time 0.5782 (0.5793) loss 6.2544 (6.6725) grad_norm 3.0013 (2.9043) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:40:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][440/625] eta 0:01:47 lr 0.000029 wd 0.0500 time 0.5747 (0.5827) data time 0.0008 (0.0037) model time 0.5739 (0.5792) loss 7.1330 (6.6804) grad_norm 3.8630 (2.9079) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:40:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][450/625] eta 0:01:41 lr 0.000029 wd 0.0500 time 0.5805 (0.5825) data time 0.0010 (0.0037) model time 0.5796 (0.5791) loss 6.5266 (6.6750) grad_norm 3.7668 (2.9066) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:40:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][460/625] eta 0:01:36 lr 0.000029 wd 0.0500 time 0.5809 (0.5824) data time 0.0009 (0.0036) model time 0.5800 (0.5790) loss 5.7349 (6.6754) grad_norm 3.6242 (2.9102) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:40:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][470/625] eta 0:01:30 lr 0.000029 wd 0.0500 time 0.5783 (0.5823) data time 0.0008 (0.0036) model time 0.5775 (0.5789) loss 6.3972 (6.6713) grad_norm 2.9065 (2.9098) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:40:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][480/625] eta 0:01:24 lr 0.000029 wd 0.0500 time 0.5864 (0.5822) data time 0.0011 (0.0035) model time 0.5853 (0.5789) loss 7.1739 (6.6669) grad_norm 1.9200 (2.9081) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:41:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][490/625] eta 0:01:18 lr 0.000029 wd 0.0500 time 0.5784 (0.5821) data time 0.0010 (0.0035) model time 0.5774 (0.5788) loss 6.7771 (6.6678) grad_norm 2.3318 (2.9046) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:41:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][500/625] eta 0:01:12 lr 0.000029 wd 0.0500 time 0.5743 (0.5820) data time 0.0009 (0.0034) model time 0.5735 (0.5787) loss 6.4683 (6.6643) grad_norm 1.9888 (2.8966) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:41:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][510/625] eta 0:01:06 lr 0.000029 wd 0.0500 time 0.5762 (0.5818) data time 0.0010 (0.0034) model time 0.5752 (0.5786) loss 7.6602 (6.6712) grad_norm 2.4476 (2.8931) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:41:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][520/625] eta 0:01:01 lr 0.000029 wd 0.0500 time 0.5745 (0.5824) data time 0.0008 (0.0033) model time 0.5737 (0.5793) loss 6.4684 (6.6764) grad_norm 2.1890 (2.8898) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:41:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][530/625] eta 0:00:55 lr 0.000029 wd 0.0500 time 0.5735 (0.5822) data time 0.0008 (0.0033) model time 0.5727 (0.5791) loss 7.2738 (6.6803) grad_norm 3.3694 (2.9144) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:41:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][540/625] eta 0:00:49 lr 0.000029 wd 0.0500 time 0.5767 (0.5821) data time 0.0008 (0.0032) model time 0.5759 (0.5790) loss 7.8201 (6.6827) grad_norm 2.0632 (2.9067) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:41:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][550/625] eta 0:00:43 lr 0.000029 wd 0.0500 time 0.5760 (0.5820) data time 0.0008 (0.0032) model time 0.5751 (0.5790) loss 6.8711 (6.6902) grad_norm 2.3260 (2.9021) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:41:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][560/625] eta 0:00:37 lr 0.000029 wd 0.0500 time 0.5780 (0.5819) data time 0.0009 (0.0032) model time 0.5772 (0.5789) loss 5.4133 (6.6857) grad_norm 2.2051 (2.8932) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:41:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][570/625] eta 0:00:31 lr 0.000029 wd 0.0500 time 0.5723 (0.5818) data time 0.0010 (0.0031) model time 0.5712 (0.5788) loss 6.0162 (6.6794) grad_norm 2.4982 (2.8827) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:41:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][580/625] eta 0:00:26 lr 0.000029 wd 0.0500 time 0.5835 (0.5817) data time 0.0008 (0.0031) model time 0.5827 (0.5787) loss 5.5676 (6.6744) grad_norm 3.0674 (2.8884) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:42:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][590/625] eta 0:00:20 lr 0.000028 wd 0.0500 time 0.5747 (0.5816) data time 0.0008 (0.0031) model time 0.5739 (0.5786) loss 5.7403 (6.6732) grad_norm 4.3850 (2.8961) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:42:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][600/625] eta 0:00:14 lr 0.000028 wd 0.0500 time 0.5775 (0.5815) data time 0.0011 (0.0030) model time 0.5765 (0.5785) loss 6.6964 (6.6749) grad_norm 2.1815 (2.8866) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:42:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][610/625] eta 0:00:08 lr 0.000028 wd 0.0500 time 0.5771 (0.5814) data time 0.0005 (0.0030) model time 0.5766 (0.5785) loss 6.8089 (6.6767) grad_norm 2.5900 (2.8915) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:42:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [278/300][620/625] eta 0:00:02 lr 0.000028 wd 0.0500 time 0.5750 (0.5814) data time 0.0005 (0.0030) model time 0.5744 (0.5786) loss 6.3516 (6.6840) grad_norm 2.0283 (2.8909) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:42:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 278 training takes 0:06:03 +[2024-07-29 07:42:22 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 07:42:26 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 07:42:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.543 (0.543) Loss 0.4932 (0.4932) Acc@1 90.479 (90.479) Acc@5 99.023 (99.023) Mem 22338MB +[2024-07-29 07:42:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.164) Loss 0.7363 (0.5913) Acc@1 83.008 (88.210) Acc@5 97.363 (98.184) Mem 22338MB +[2024-07-29 07:42:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.146) Loss 0.8047 (0.6763) Acc@1 81.152 (85.591) Acc@5 96.582 (97.421) Mem 22338MB +[2024-07-29 07:42:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.239 Acc@5 97.415 +[2024-07-29 07:42:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.2% +[2024-07-29 07:42:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.951 (0.951) Loss 0.4932 (0.4932) Acc@1 90.430 (90.430) Acc@5 99.023 (99.023) Mem 22338MB +[2024-07-29 07:42:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.127 (0.203) Loss 0.7334 (0.5970) Acc@1 83.301 (88.263) Acc@5 97.217 (98.162) Mem 22338MB +[2024-07-29 07:42:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.166) Loss 0.8164 (0.6811) Acc@1 81.006 (85.621) Acc@5 96.387 (97.405) Mem 22338MB +[2024-07-29 07:42:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.237 Acc@5 97.401 +[2024-07-29 07:42:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.2% +[2024-07-29 07:42:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.24% +[2024-07-29 07:42:34 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 07:42:37 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 07:42:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][0/625] eta 0:10:43 lr 0.000028 wd 0.0500 time 1.0295 (1.0295) data time 0.4949 (0.4949) model time 0.0000 (0.0000) loss 5.6858 (5.6858) grad_norm 2.3913 (2.3913) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:42:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][10/625] eta 0:06:19 lr 0.000028 wd 0.0500 time 0.5726 (0.6165) data time 0.0008 (0.0458) model time 0.0000 (0.0000) loss 6.8990 (6.6380) grad_norm 2.5352 (2.9949) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:42:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][20/625] eta 0:06:00 lr 0.000028 wd 0.0500 time 0.5718 (0.5963) data time 0.0011 (0.0245) model time 0.0000 (0.0000) loss 6.7897 (6.5343) grad_norm 1.9494 (3.0896) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:42:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][30/625] eta 0:05:50 lr 0.000028 wd 0.0500 time 0.5725 (0.5898) data time 0.0010 (0.0169) model time 0.0000 (0.0000) loss 7.6109 (6.6613) grad_norm 2.6980 (2.8918) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:43:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][40/625] eta 0:05:43 lr 0.000028 wd 0.0500 time 0.5730 (0.5866) data time 0.0007 (0.0130) model time 0.0000 (0.0000) loss 5.7261 (6.6368) grad_norm 2.2061 (2.9095) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:43:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][50/625] eta 0:05:36 lr 0.000028 wd 0.0500 time 0.5747 (0.5852) data time 0.0010 (0.0107) model time 0.0000 (0.0000) loss 7.2231 (6.6233) grad_norm 2.2947 (2.8286) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:43:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][60/625] eta 0:05:30 lr 0.000028 wd 0.0500 time 0.5798 (0.5845) data time 0.0010 (0.0091) model time 0.5787 (0.5799) loss 6.7413 (6.6666) grad_norm 2.6662 (2.7768) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:43:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][70/625] eta 0:05:24 lr 0.000028 wd 0.0500 time 0.5742 (0.5841) data time 0.0011 (0.0080) model time 0.5730 (0.5801) loss 5.9172 (6.6744) grad_norm 2.3321 (2.7202) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:43:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][80/625] eta 0:05:17 lr 0.000028 wd 0.0500 time 0.5744 (0.5831) data time 0.0008 (0.0072) model time 0.5736 (0.5782) loss 6.0765 (6.6571) grad_norm 2.9637 (2.9113) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:43:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][90/625] eta 0:05:11 lr 0.000028 wd 0.0500 time 0.5780 (0.5826) data time 0.0009 (0.0065) model time 0.5770 (0.5780) loss 5.5289 (6.7009) grad_norm 2.0600 (2.9314) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:43:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][100/625] eta 0:05:05 lr 0.000028 wd 0.0500 time 0.5786 (0.5821) data time 0.0010 (0.0060) model time 0.5775 (0.5776) loss 7.2931 (6.6952) grad_norm 2.7925 (2.8830) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:43:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][110/625] eta 0:05:00 lr 0.000028 wd 0.0500 time 0.7362 (0.5833) data time 0.0010 (0.0055) model time 0.7352 (0.5804) loss 5.9508 (6.6581) grad_norm 1.7824 (2.8821) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:43:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][120/625] eta 0:04:55 lr 0.000028 wd 0.0500 time 0.5757 (0.5849) data time 0.0010 (0.0052) model time 0.5747 (0.5834) loss 7.6097 (6.6794) grad_norm 2.6619 (2.8346) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:43:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][130/625] eta 0:04:49 lr 0.000028 wd 0.0500 time 0.5742 (0.5844) data time 0.0010 (0.0049) model time 0.5732 (0.5828) loss 6.7540 (6.6766) grad_norm 1.7211 (2.7984) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:43:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][140/625] eta 0:04:43 lr 0.000028 wd 0.0500 time 0.5722 (0.5839) data time 0.0008 (0.0046) model time 0.5714 (0.5819) loss 7.6878 (6.6718) grad_norm 2.0800 (2.7872) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:44:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][150/625] eta 0:04:37 lr 0.000028 wd 0.0500 time 0.5761 (0.5835) data time 0.0007 (0.0044) model time 0.5754 (0.5815) loss 6.6063 (6.6617) grad_norm 2.2522 (2.7556) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:44:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][160/625] eta 0:04:31 lr 0.000028 wd 0.0500 time 0.5786 (0.5830) data time 0.0008 (0.0042) model time 0.5778 (0.5809) loss 5.7797 (6.6375) grad_norm 3.1744 (2.8105) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:44:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][170/625] eta 0:04:25 lr 0.000028 wd 0.0500 time 0.5771 (0.5827) data time 0.0008 (0.0040) model time 0.5763 (0.5805) loss 6.0931 (6.6510) grad_norm 2.7983 (2.7912) loss_scale 128.0000 (128.0000) mem 22338MB +[2024-07-29 07:44:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 07:44:19 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 07:44:21 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 07:46:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 07:46:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 07:47:00 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 07:47:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 07:47:18 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 07:47:18 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 07:47:19 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 07:47:19 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 279) +[2024-07-29 07:47:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 07:47:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][180/625] eta 0:21:08 lr 0.000028 wd 0.0500 time 0.5211 (2.8507) data time 0.0008 (0.1741) model time 0.5203 (2.6766) loss 6.8357 (6.9621) grad_norm 2.2886 (5.6645) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:47:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][190/625] eta 0:10:07 lr 0.000028 wd 0.0500 time 0.5563 (1.3966) data time 0.0011 (0.0659) model time 0.5552 (1.3307) loss 7.4444 (6.7410) grad_norm 8.8238 (4.4928) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:47:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][200/625] eta 0:07:31 lr 0.000028 wd 0.0500 time 0.5174 (1.0620) data time 0.0008 (0.0410) model time 0.5166 (1.0210) loss 6.7894 (6.9091) grad_norm 2.6727 (3.9847) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:47:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][210/625] eta 0:06:18 lr 0.000028 wd 0.0500 time 0.5183 (0.9122) data time 0.0010 (0.0299) model time 0.5173 (0.8824) loss 7.6797 (6.9079) grad_norm 2.3307 (3.5738) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:48:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][220/625] eta 0:05:35 lr 0.000028 wd 0.0500 time 0.5200 (0.8288) data time 0.0013 (0.0236) model time 0.5187 (0.8052) loss 6.7104 (6.8377) grad_norm 2.3366 (3.3727) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:48:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][230/625] eta 0:05:09 lr 0.000028 wd 0.0500 time 0.5557 (0.7830) data time 0.0009 (0.0196) model time 0.5548 (0.7634) loss 8.0288 (6.8414) grad_norm 6.7891 (3.5155) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:48:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][240/625] eta 0:04:46 lr 0.000028 wd 0.0500 time 0.5689 (0.7435) data time 0.0007 (0.0167) model time 0.5682 (0.7268) loss 6.4192 (6.7889) grad_norm 3.4680 (3.3879) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:48:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][250/625] eta 0:04:27 lr 0.000028 wd 0.0500 time 0.5192 (0.7143) data time 0.0010 (0.0147) model time 0.5182 (0.6996) loss 7.4316 (6.7801) grad_norm 1.7391 (3.2730) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:48:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][260/625] eta 0:04:12 lr 0.000028 wd 0.0500 time 0.5159 (0.6922) data time 0.0009 (0.0131) model time 0.5150 (0.6791) loss 5.3024 (6.7462) grad_norm 1.8932 (3.1719) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:48:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][270/625] eta 0:03:59 lr 0.000028 wd 0.0500 time 0.5187 (0.6753) data time 0.0008 (0.0118) model time 0.5179 (0.6634) loss 6.3535 (6.7583) grad_norm 3.1776 (3.1562) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:48:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][280/625] eta 0:03:48 lr 0.000028 wd 0.0500 time 0.5181 (0.6610) data time 0.0009 (0.0108) model time 0.5172 (0.6502) loss 7.2167 (6.7791) grad_norm 2.8141 (3.1030) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:48:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][290/625] eta 0:03:37 lr 0.000028 wd 0.0500 time 0.5168 (0.6494) data time 0.0008 (0.0100) model time 0.5161 (0.6395) loss 7.0701 (6.7713) grad_norm 2.6834 (3.0793) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:48:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][300/625] eta 0:03:27 lr 0.000028 wd 0.0500 time 0.5181 (0.6391) data time 0.0008 (0.0092) model time 0.5173 (0.6298) loss 5.0538 (6.7621) grad_norm 2.6427 (3.0383) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:48:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][310/625] eta 0:03:18 lr 0.000028 wd 0.0500 time 0.5158 (0.6308) data time 0.0014 (0.0086) model time 0.5144 (0.6221) loss 5.5811 (6.7688) grad_norm 2.5902 (3.0283) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:48:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][320/625] eta 0:03:10 lr 0.000028 wd 0.0500 time 0.5119 (0.6236) data time 0.0008 (0.0081) model time 0.5112 (0.6154) loss 5.7380 (6.7410) grad_norm 2.0229 (2.9901) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:49:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][330/625] eta 0:03:02 lr 0.000028 wd 0.0500 time 0.5132 (0.6176) data time 0.0010 (0.0077) model time 0.5122 (0.6100) loss 6.7422 (6.7455) grad_norm 4.2927 (3.0155) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:49:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][340/625] eta 0:02:54 lr 0.000028 wd 0.0500 time 0.5230 (0.6118) data time 0.0010 (0.0073) model time 0.5219 (0.6045) loss 6.7583 (6.7396) grad_norm 2.3625 (3.0247) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:49:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][350/625] eta 0:02:46 lr 0.000028 wd 0.0500 time 0.5177 (0.6069) data time 0.0007 (0.0069) model time 0.5170 (0.6000) loss 6.4501 (6.7338) grad_norm 2.2202 (3.0026) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:49:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][360/625] eta 0:02:39 lr 0.000028 wd 0.0500 time 0.5153 (0.6023) data time 0.0011 (0.0066) model time 0.5142 (0.5958) loss 5.1151 (6.7231) grad_norm 2.3395 (2.9881) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:49:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][370/625] eta 0:02:32 lr 0.000028 wd 0.0500 time 0.5167 (0.5986) data time 0.0007 (0.0063) model time 0.5160 (0.5923) loss 6.2998 (6.7168) grad_norm 2.1340 (2.9565) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:49:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][380/625] eta 0:02:25 lr 0.000027 wd 0.0500 time 0.5161 (0.5949) data time 0.0007 (0.0060) model time 0.5154 (0.5888) loss 5.4762 (6.7078) grad_norm 2.7628 (2.9500) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:49:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][390/625] eta 0:02:19 lr 0.000027 wd 0.0500 time 0.5182 (0.5915) data time 0.0009 (0.0058) model time 0.5174 (0.5857) loss 5.8259 (6.7003) grad_norm 3.2356 (2.9509) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:49:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][400/625] eta 0:02:12 lr 0.000027 wd 0.0500 time 0.5118 (0.5885) data time 0.0007 (0.0056) model time 0.5110 (0.5830) loss 6.4316 (6.7023) grad_norm 5.1909 (2.9900) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:49:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][410/625] eta 0:02:05 lr 0.000027 wd 0.0500 time 0.5171 (0.5859) data time 0.0007 (0.0054) model time 0.5164 (0.5805) loss 6.6350 (6.6948) grad_norm 2.4620 (2.9633) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:49:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][420/625] eta 0:01:59 lr 0.000027 wd 0.0500 time 0.5180 (0.5834) data time 0.0007 (0.0052) model time 0.5173 (0.5782) loss 5.4866 (6.6971) grad_norm 2.4085 (2.9345) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:49:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][430/625] eta 0:01:53 lr 0.000027 wd 0.0500 time 0.5630 (0.5814) data time 0.0009 (0.0051) model time 0.5621 (0.5764) loss 5.8945 (6.6832) grad_norm 2.4607 (2.9253) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:49:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][440/625] eta 0:01:47 lr 0.000027 wd 0.0500 time 0.5666 (0.5793) data time 0.0007 (0.0049) model time 0.5659 (0.5744) loss 6.5331 (6.6732) grad_norm 1.9131 (2.9195) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:50:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][450/625] eta 0:01:41 lr 0.000027 wd 0.0500 time 0.5166 (0.5780) data time 0.0010 (0.0048) model time 0.5156 (0.5732) loss 7.5819 (6.6725) grad_norm 3.1329 (2.9063) loss_scale 128.0000 (128.0000) mem 22344MB +[2024-07-29 07:50:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][460/625] eta 0:01:35 lr 0.000027 wd 0.0500 time 0.5182 (0.5761) data time 0.0009 (0.0046) model time 0.5173 (0.5715) loss 6.5003 (6.6725) grad_norm 5.4588 (2.9042) loss_scale 256.0000 (129.7902) mem 22344MB +[2024-07-29 07:50:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][470/625] eta 0:01:29 lr 0.000027 wd 0.0500 time 0.5336 (0.5744) data time 0.0007 (0.0045) model time 0.5329 (0.5699) loss 5.0475 (6.6600) grad_norm 2.7938 (2.9058) loss_scale 256.0000 (134.0541) mem 22344MB +[2024-07-29 07:50:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][480/625] eta 0:01:23 lr 0.000027 wd 0.0500 time 0.5202 (0.5730) data time 0.0007 (0.0044) model time 0.5195 (0.5686) loss 6.0715 (6.6512) grad_norm 3.0569 (2.9417) loss_scale 256.0000 (138.0392) mem 22344MB +[2024-07-29 07:50:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][490/625] eta 0:01:17 lr 0.000027 wd 0.0500 time 0.5188 (0.5712) data time 0.0008 (0.0043) model time 0.5181 (0.5669) loss 6.4891 (6.6591) grad_norm 3.3630 (2.9651) loss_scale 256.0000 (141.7722) mem 22344MB +[2024-07-29 07:50:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][500/625] eta 0:01:11 lr 0.000027 wd 0.0500 time 0.5218 (0.5699) data time 0.0009 (0.0042) model time 0.5209 (0.5657) loss 6.9592 (6.6677) grad_norm 2.7181 (2.9613) loss_scale 256.0000 (145.2761) mem 22344MB +[2024-07-29 07:50:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][510/625] eta 0:01:05 lr 0.000027 wd 0.0500 time 0.5187 (0.5684) data time 0.0010 (0.0041) model time 0.5177 (0.5643) loss 7.7775 (6.6694) grad_norm 2.0641 (2.9591) loss_scale 256.0000 (148.5714) mem 22344MB +[2024-07-29 07:50:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][520/625] eta 0:00:59 lr 0.000027 wd 0.0500 time 0.5170 (0.5673) data time 0.0011 (0.0040) model time 0.5159 (0.5633) loss 7.5062 (6.6751) grad_norm 2.5844 (2.9420) loss_scale 256.0000 (151.6763) mem 22344MB +[2024-07-29 07:50:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][530/625] eta 0:00:53 lr 0.000027 wd 0.0500 time 0.5665 (0.5660) data time 0.0007 (0.0039) model time 0.5657 (0.5621) loss 6.3462 (6.6755) grad_norm 1.9269 (2.9585) loss_scale 256.0000 (154.6067) mem 22344MB +[2024-07-29 07:50:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][540/625] eta 0:00:48 lr 0.000027 wd 0.0500 time 0.5231 (0.5649) data time 0.0009 (0.0038) model time 0.5222 (0.5610) loss 6.7572 (6.6712) grad_norm 2.6969 (2.9442) loss_scale 256.0000 (157.3770) mem 22344MB +[2024-07-29 07:50:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][550/625] eta 0:00:42 lr 0.000027 wd 0.0500 time 0.5186 (0.5639) data time 0.0007 (0.0038) model time 0.5179 (0.5601) loss 5.5595 (6.6744) grad_norm 2.9347 (2.9327) loss_scale 256.0000 (160.0000) mem 22344MB +[2024-07-29 07:51:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][560/625] eta 0:00:36 lr 0.000027 wd 0.0500 time 0.5140 (0.5631) data time 0.0012 (0.0037) model time 0.5128 (0.5594) loss 7.1930 (6.6657) grad_norm 3.9777 (2.9368) loss_scale 256.0000 (162.4870) mem 22344MB +[2024-07-29 07:51:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][570/625] eta 0:00:30 lr 0.000027 wd 0.0500 time 0.5177 (0.5621) data time 0.0007 (0.0036) model time 0.5170 (0.5585) loss 7.0911 (6.6666) grad_norm 2.5981 (2.9703) loss_scale 256.0000 (164.8485) mem 22344MB +[2024-07-29 07:51:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][580/625] eta 0:00:25 lr 0.000027 wd 0.0500 time 0.5185 (0.5610) data time 0.0007 (0.0035) model time 0.5177 (0.5575) loss 7.2481 (6.6682) grad_norm 1.9535 (2.9884) loss_scale 256.0000 (167.0936) mem 22344MB +[2024-07-29 07:51:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][590/625] eta 0:00:19 lr 0.000027 wd 0.0500 time 0.5211 (0.5601) data time 0.0008 (0.0035) model time 0.5204 (0.5566) loss 5.8108 (6.6716) grad_norm 3.4115 (2.9810) loss_scale 256.0000 (169.2308) mem 22344MB +[2024-07-29 07:51:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][600/625] eta 0:00:13 lr 0.000027 wd 0.0500 time 0.5203 (0.5593) data time 0.0010 (0.0034) model time 0.5193 (0.5559) loss 6.5263 (6.6701) grad_norm 1.9318 (2.9819) loss_scale 256.0000 (171.2676) mem 22344MB +[2024-07-29 07:51:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][610/625] eta 0:00:08 lr 0.000027 wd 0.0500 time 0.5133 (0.5585) data time 0.0007 (0.0034) model time 0.5125 (0.5551) loss 5.4316 (6.6717) grad_norm 1.9747 (2.9764) loss_scale 256.0000 (173.2110) mem 22344MB +[2024-07-29 07:51:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [279/300][620/625] eta 0:00:02 lr 0.000027 wd 0.0500 time 0.5468 (0.5579) data time 0.0005 (0.0033) model time 0.5463 (0.5546) loss 6.3373 (6.6704) grad_norm 2.3102 (2.9620) loss_scale 256.0000 (175.0673) mem 22344MB +[2024-07-29 07:51:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 279 training takes 0:04:10 +[2024-07-29 07:51:36 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 07:51:39 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 07:51:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.587 (0.587) Loss 0.4880 (0.4880) Acc@1 90.527 (90.527) Acc@5 99.023 (99.023) Mem 22344MB +[2024-07-29 07:51:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.160) Loss 0.7383 (0.5908) Acc@1 82.861 (88.153) Acc@5 97.168 (98.162) Mem 22344MB +[2024-07-29 07:51:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.140) Loss 0.8110 (0.6766) Acc@1 81.299 (85.614) Acc@5 96.436 (97.419) Mem 22344MB +[2024-07-29 07:51:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.273 Acc@5 97.429 +[2024-07-29 07:51:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 07:51:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 1.144 (1.144) Loss 0.4937 (0.4937) Acc@1 90.430 (90.430) Acc@5 99.023 (99.023) Mem 22344MB +[2024-07-29 07:51:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.212) Loss 0.7339 (0.5967) Acc@1 83.252 (88.255) Acc@5 97.217 (98.162) Mem 22344MB +[2024-07-29 07:51:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.167) Loss 0.8154 (0.6805) Acc@1 81.006 (85.612) Acc@5 96.387 (97.412) Mem 22344MB +[2024-07-29 07:51:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.225 Acc@5 97.407 +[2024-07-29 07:51:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.2% +[2024-07-29 07:51:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.22% +[2024-07-29 07:51:49 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 07:51:53 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 07:51:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][0/625] eta 0:12:49 lr 0.000027 wd 0.0500 time 1.2308 (1.2308) data time 0.4972 (0.4972) model time 0.0000 (0.0000) loss 6.5555 (6.5555) grad_norm 4.2530 (4.2530) loss_scale 256.0000 (256.0000) mem 22337MB +[2024-07-29 07:51:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][10/625] eta 0:06:03 lr 0.000027 wd 0.0500 time 0.5635 (0.5911) data time 0.0010 (0.0461) model time 0.0000 (0.0000) loss 6.9494 (6.2886) grad_norm 1.6030 (4.3981) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:52:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][20/625] eta 0:05:36 lr 0.000027 wd 0.0500 time 0.5217 (0.5569) data time 0.0010 (0.0246) model time 0.0000 (0.0000) loss 6.4316 (6.1880) grad_norm 2.1647 (3.4900) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:52:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][30/625] eta 0:05:25 lr 0.000027 wd 0.0500 time 0.5190 (0.5469) data time 0.0007 (0.0170) model time 0.0000 (0.0000) loss 6.9636 (6.2520) grad_norm 2.8777 (3.3025) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:52:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][40/625] eta 0:05:19 lr 0.000027 wd 0.0500 time 0.7202 (0.5463) data time 0.0011 (0.0131) model time 0.0000 (0.0000) loss 6.8132 (6.4551) grad_norm 3.1827 (3.1187) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:52:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][50/625] eta 0:05:11 lr 0.000027 wd 0.0500 time 0.5193 (0.5421) data time 0.0010 (0.0107) model time 0.0000 (0.0000) loss 7.4456 (6.4838) grad_norm 2.2587 (3.0990) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:52:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][60/625] eta 0:05:04 lr 0.000027 wd 0.0500 time 0.5164 (0.5389) data time 0.0008 (0.0091) model time 0.5157 (0.5213) loss 6.8009 (6.5488) grad_norm 3.2890 (3.0857) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:52:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][70/625] eta 0:04:57 lr 0.000027 wd 0.0500 time 0.5146 (0.5366) data time 0.0008 (0.0080) model time 0.5137 (0.5215) loss 6.1255 (6.5593) grad_norm 3.2743 (3.0286) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:52:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][80/625] eta 0:04:51 lr 0.000027 wd 0.0500 time 0.5320 (0.5350) data time 0.0012 (0.0071) model time 0.5308 (0.5218) loss 7.2489 (6.5277) grad_norm 2.0882 (2.9873) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:52:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][90/625] eta 0:04:45 lr 0.000027 wd 0.0500 time 0.5208 (0.5341) data time 0.0011 (0.0064) model time 0.5197 (0.5229) loss 7.2217 (6.5322) grad_norm 2.3712 (2.9793) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:52:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][100/625] eta 0:04:39 lr 0.000027 wd 0.0500 time 0.5178 (0.5329) data time 0.0008 (0.0059) model time 0.5170 (0.5225) loss 7.8342 (6.5128) grad_norm 2.9744 (2.9714) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:52:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][110/625] eta 0:04:34 lr 0.000027 wd 0.0500 time 0.5178 (0.5325) data time 0.0009 (0.0054) model time 0.5168 (0.5234) loss 6.5531 (6.5305) grad_norm 2.2645 (2.9447) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:52:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][120/625] eta 0:04:28 lr 0.000027 wd 0.0500 time 0.5180 (0.5315) data time 0.0010 (0.0051) model time 0.5170 (0.5228) loss 6.6295 (6.5512) grad_norm 2.5727 (2.9138) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:53:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][130/625] eta 0:04:22 lr 0.000027 wd 0.0500 time 0.5194 (0.5313) data time 0.0007 (0.0048) model time 0.5186 (0.5234) loss 7.3538 (6.5619) grad_norm 3.9811 (2.8909) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:53:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][140/625] eta 0:04:17 lr 0.000027 wd 0.0500 time 0.5211 (0.5309) data time 0.0011 (0.0045) model time 0.5200 (0.5235) loss 7.3916 (6.5752) grad_norm 3.4339 (2.8906) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:53:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][150/625] eta 0:04:12 lr 0.000027 wd 0.0500 time 0.5194 (0.5309) data time 0.0009 (0.0043) model time 0.5185 (0.5243) loss 5.3443 (6.5743) grad_norm 2.7811 (2.8676) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:53:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][160/625] eta 0:04:06 lr 0.000027 wd 0.0500 time 0.5744 (0.5306) data time 0.0010 (0.0041) model time 0.5735 (0.5243) loss 7.1961 (6.5703) grad_norm 2.2267 (2.8721) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:53:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][170/625] eta 0:04:01 lr 0.000026 wd 0.0500 time 0.5621 (0.5308) data time 0.0008 (0.0039) model time 0.5613 (0.5250) loss 7.0431 (6.5744) grad_norm 2.6721 (2.8959) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:53:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][180/625] eta 0:03:56 lr 0.000026 wd 0.0500 time 0.5584 (0.5304) data time 0.0011 (0.0037) model time 0.5573 (0.5248) loss 6.8109 (6.5737) grad_norm 4.8515 (2.9153) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:53:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][190/625] eta 0:03:50 lr 0.000026 wd 0.0500 time 0.5230 (0.5300) data time 0.0008 (0.0036) model time 0.5222 (0.5246) loss 5.6447 (6.5719) grad_norm 3.9312 (2.9081) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:53:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][200/625] eta 0:03:45 lr 0.000026 wd 0.0500 time 0.5180 (0.5300) data time 0.0007 (0.0035) model time 0.5173 (0.5249) loss 7.2782 (6.5757) grad_norm 3.6956 (2.8907) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:53:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][210/625] eta 0:03:39 lr 0.000026 wd 0.0500 time 0.5321 (0.5296) data time 0.0008 (0.0033) model time 0.5313 (0.5246) loss 5.5716 (6.5487) grad_norm 2.9384 (3.0011) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:53:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][220/625] eta 0:03:34 lr 0.000026 wd 0.0500 time 0.5204 (0.5302) data time 0.0008 (0.0032) model time 0.5196 (0.5256) loss 7.3177 (6.5658) grad_norm 2.2721 (2.9951) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:53:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][230/625] eta 0:03:29 lr 0.000026 wd 0.0500 time 0.5227 (0.5297) data time 0.0007 (0.0031) model time 0.5220 (0.5251) loss 7.1917 (6.5712) grad_norm 2.4814 (2.9856) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:54:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][240/625] eta 0:03:23 lr 0.000026 wd 0.0500 time 0.5163 (0.5295) data time 0.0009 (0.0030) model time 0.5154 (0.5251) loss 5.3585 (6.5667) grad_norm 7.4464 (3.0050) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:54:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][250/625] eta 0:03:18 lr 0.000026 wd 0.0500 time 0.5168 (0.5294) data time 0.0010 (0.0030) model time 0.5159 (0.5252) loss 8.0078 (6.5697) grad_norm 2.7678 (2.9833) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:54:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][260/625] eta 0:03:13 lr 0.000026 wd 0.0500 time 0.5169 (0.5294) data time 0.0008 (0.0029) model time 0.5161 (0.5253) loss 4.8899 (6.5687) grad_norm 2.6904 (2.9726) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:54:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][270/625] eta 0:03:08 lr 0.000026 wd 0.0500 time 0.5175 (0.5297) data time 0.0008 (0.0028) model time 0.5167 (0.5259) loss 7.8809 (6.5660) grad_norm 6.5193 (2.9758) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:54:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][280/625] eta 0:03:02 lr 0.000026 wd 0.0500 time 0.5175 (0.5298) data time 0.0009 (0.0027) model time 0.5166 (0.5260) loss 7.3549 (6.5684) grad_norm 3.4476 (2.9940) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:54:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][290/625] eta 0:02:57 lr 0.000026 wd 0.0500 time 0.5217 (0.5296) data time 0.0010 (0.0027) model time 0.5207 (0.5259) loss 6.8199 (6.5876) grad_norm 1.8500 (2.9743) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:54:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][300/625] eta 0:02:52 lr 0.000026 wd 0.0500 time 0.5176 (0.5294) data time 0.0010 (0.0026) model time 0.5166 (0.5259) loss 6.9856 (6.5832) grad_norm 6.2693 (2.9728) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:54:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][310/625] eta 0:02:46 lr 0.000026 wd 0.0500 time 0.5704 (0.5293) data time 0.0010 (0.0026) model time 0.5695 (0.5258) loss 7.7213 (6.5813) grad_norm 2.8115 (2.9728) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:54:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][320/625] eta 0:02:41 lr 0.000026 wd 0.0500 time 0.5177 (0.5292) data time 0.0009 (0.0025) model time 0.5168 (0.5258) loss 6.8371 (6.5803) grad_norm 2.0744 (3.0148) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:54:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][330/625] eta 0:02:36 lr 0.000026 wd 0.0500 time 0.5191 (0.5290) data time 0.0007 (0.0025) model time 0.5184 (0.5257) loss 7.7492 (6.5887) grad_norm 3.1555 (3.0276) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:54:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][340/625] eta 0:02:30 lr 0.000026 wd 0.0500 time 0.5164 (0.5290) data time 0.0011 (0.0024) model time 0.5153 (0.5257) loss 6.4962 (6.5872) grad_norm 2.0754 (3.0086) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:54:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][350/625] eta 0:02:25 lr 0.000026 wd 0.0500 time 0.5200 (0.5289) data time 0.0007 (0.0024) model time 0.5193 (0.5256) loss 7.0692 (6.5933) grad_norm 4.1576 (3.0423) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:55:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][360/625] eta 0:02:20 lr 0.000026 wd 0.0500 time 0.5726 (0.5290) data time 0.0008 (0.0024) model time 0.5719 (0.5258) loss 5.3976 (6.5772) grad_norm 2.0122 (3.0323) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:55:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][370/625] eta 0:02:14 lr 0.000026 wd 0.0500 time 0.5271 (0.5288) data time 0.0009 (0.0023) model time 0.5263 (0.5257) loss 6.5769 (6.5805) grad_norm 2.9257 (3.0240) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:55:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][380/625] eta 0:02:09 lr 0.000026 wd 0.0500 time 0.5190 (0.5287) data time 0.0012 (0.0023) model time 0.5178 (0.5256) loss 5.8585 (6.5771) grad_norm 8.0800 (3.0420) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:55:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][390/625] eta 0:02:04 lr 0.000026 wd 0.0500 time 0.5196 (0.5288) data time 0.0009 (0.0023) model time 0.5187 (0.5258) loss 5.9243 (6.5670) grad_norm 1.8821 (3.0375) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:55:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][400/625] eta 0:01:58 lr 0.000026 wd 0.0500 time 0.5221 (0.5287) data time 0.0007 (0.0022) model time 0.5214 (0.5257) loss 7.3317 (6.5661) grad_norm 4.0728 (3.0288) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:55:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][410/625] eta 0:01:53 lr 0.000026 wd 0.0500 time 0.5170 (0.5286) data time 0.0010 (0.0022) model time 0.5160 (0.5257) loss 7.3628 (6.5743) grad_norm 4.0020 (3.0212) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:55:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][420/625] eta 0:01:48 lr 0.000026 wd 0.0500 time 0.5153 (0.5283) data time 0.0009 (0.0022) model time 0.5144 (0.5255) loss 6.3026 (6.5737) grad_norm 5.7389 (3.0463) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:55:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][430/625] eta 0:01:43 lr 0.000026 wd 0.0500 time 0.5184 (0.5283) data time 0.0007 (0.0021) model time 0.5177 (0.5255) loss 6.1044 (6.5656) grad_norm 2.4834 (3.0378) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:55:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][440/625] eta 0:01:37 lr 0.000026 wd 0.0500 time 0.6907 (0.5287) data time 0.0009 (0.0021) model time 0.6898 (0.5259) loss 6.6872 (6.5663) grad_norm 2.0371 (3.0287) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:55:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][450/625] eta 0:01:32 lr 0.000026 wd 0.0500 time 0.5220 (0.5287) data time 0.0010 (0.0021) model time 0.5210 (0.5260) loss 7.6375 (6.5697) grad_norm 2.7883 (3.0225) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:55:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][460/625] eta 0:01:27 lr 0.000026 wd 0.0500 time 0.5258 (0.5285) data time 0.0010 (0.0021) model time 0.5248 (0.5258) loss 6.0856 (6.5710) grad_norm 2.1365 (3.0164) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:56:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][470/625] eta 0:01:21 lr 0.000026 wd 0.0500 time 0.5164 (0.5284) data time 0.0008 (0.0020) model time 0.5156 (0.5258) loss 6.4573 (6.5692) grad_norm 3.5493 (3.0134) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:56:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][480/625] eta 0:01:16 lr 0.000026 wd 0.0500 time 0.5156 (0.5283) data time 0.0010 (0.0020) model time 0.5146 (0.5257) loss 7.2633 (6.5767) grad_norm 2.8802 (3.0196) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:56:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][490/625] eta 0:01:11 lr 0.000026 wd 0.0500 time 0.5205 (0.5288) data time 0.0008 (0.0020) model time 0.5197 (0.5263) loss 7.3225 (6.5775) grad_norm 1.7721 (3.0082) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:56:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][500/625] eta 0:01:06 lr 0.000026 wd 0.0500 time 0.5162 (0.5287) data time 0.0010 (0.0020) model time 0.5152 (0.5262) loss 6.4558 (6.5709) grad_norm 2.6943 (3.0027) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 07:56:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 07:56:20 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 07:56:22 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 07:58:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 07:58:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 07:59:18 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 07:59:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 07:59:29 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 07:59:29 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 07:59:29 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 07:59:30 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 280) +[2024-07-29 07:59:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 07:59:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][510/625] eta 0:05:49 lr 0.000026 wd 0.0500 time 0.5287 (3.0369) data time 0.0009 (0.1697) model time 0.5279 (2.8672) loss 6.9247 (7.1670) grad_norm 1.8742 (2.7583) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 07:59:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][520/625] eta 0:02:33 lr 0.000026 wd 0.0500 time 0.5312 (1.4659) data time 0.0019 (0.0644) model time 0.5293 (1.4015) loss 8.1934 (7.1611) grad_norm 1.8114 (2.6928) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 08:00:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][530/625] eta 0:01:44 lr 0.000026 wd 0.0500 time 0.5377 (1.1036) data time 0.0008 (0.0401) model time 0.5369 (1.0635) loss 7.4221 (7.1256) grad_norm 2.0277 (2.5863) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 08:00:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][540/625] eta 0:01:20 lr 0.000026 wd 0.0500 time 0.5352 (0.9441) data time 0.0017 (0.0294) model time 0.5336 (0.9147) loss 6.6574 (7.0812) grad_norm 2.2197 (2.7000) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 08:00:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][550/625] eta 0:01:03 lr 0.000026 wd 0.0500 time 0.5296 (0.8530) data time 0.0012 (0.0233) model time 0.5285 (0.8298) loss 5.9079 (6.9158) grad_norm 3.3089 (2.6984) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 08:00:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][560/625] eta 0:00:52 lr 0.000026 wd 0.0500 time 0.5284 (0.8045) data time 0.0009 (0.0193) model time 0.5275 (0.7852) loss 7.2075 (6.8552) grad_norm 2.2102 (2.6884) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 08:00:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][570/625] eta 0:00:41 lr 0.000026 wd 0.0500 time 0.5178 (0.7618) data time 0.0009 (0.0165) model time 0.5169 (0.7453) loss 6.0563 (6.8086) grad_norm 2.4943 (2.7487) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 08:00:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][580/625] eta 0:00:32 lr 0.000026 wd 0.0500 time 0.5183 (0.7320) data time 0.0010 (0.0145) model time 0.5173 (0.7175) loss 7.4762 (6.7928) grad_norm 2.7161 (2.8224) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 08:00:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][590/625] eta 0:00:24 lr 0.000026 wd 0.0500 time 0.5184 (0.7072) data time 0.0008 (0.0129) model time 0.5176 (0.6942) loss 5.4539 (6.7763) grad_norm 2.4894 (2.7950) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 08:00:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][600/625] eta 0:00:17 lr 0.000026 wd 0.0500 time 0.5154 (0.6884) data time 0.0008 (0.0117) model time 0.5146 (0.6767) loss 6.7194 (6.7757) grad_norm 3.5879 (2.8106) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 08:00:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][610/625] eta 0:00:10 lr 0.000025 wd 0.0500 time 0.5149 (0.6728) data time 0.0008 (0.0107) model time 0.5141 (0.6620) loss 6.6978 (6.7884) grad_norm 2.5769 (2.7891) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 08:00:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [280/300][620/625] eta 0:00:03 lr 0.000025 wd 0.0500 time 0.5139 (0.6602) data time 0.0005 (0.0099) model time 0.5134 (0.6503) loss 6.9644 (6.7650) grad_norm 2.5532 (2.7719) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 08:00:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 280 training takes 0:01:18 +[2024-07-29 08:00:53 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 08:00:59 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 08:01:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.583 (0.583) Loss 0.4902 (0.4902) Acc@1 90.381 (90.381) Acc@5 99.023 (99.023) Mem 22344MB +[2024-07-29 08:01:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.117 (0.160) Loss 0.7349 (0.5902) Acc@1 83.447 (88.246) Acc@5 97.314 (98.180) Mem 22344MB +[2024-07-29 08:01:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.139) Loss 0.8057 (0.6747) Acc@1 80.713 (85.624) Acc@5 96.387 (97.431) Mem 22344MB +[2024-07-29 08:01:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.277 Acc@5 97.425 +[2024-07-29 08:01:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 08:01:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 1.033 (1.033) Loss 0.4932 (0.4932) Acc@1 90.430 (90.430) Acc@5 99.023 (99.023) Mem 22344MB +[2024-07-29 08:01:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.201) Loss 0.7339 (0.5961) Acc@1 83.301 (88.281) Acc@5 97.217 (98.153) Mem 22344MB +[2024-07-29 08:01:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.161) Loss 0.8145 (0.6800) Acc@1 81.006 (85.638) Acc@5 96.387 (97.407) Mem 22344MB +[2024-07-29 08:01:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.253 Acc@5 97.403 +[2024-07-29 08:01:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 08:01:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.25% +[2024-07-29 08:01:09 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 08:01:11 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 08:01:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][0/625] eta 0:14:36 lr 0.000025 wd 0.0500 time 1.4020 (1.4020) data time 0.6042 (0.6042) model time 0.0000 (0.0000) loss 5.6711 (5.6711) grad_norm 2.7960 (2.7960) loss_scale 256.0000 (256.0000) mem 22337MB +[2024-07-29 08:01:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][10/625] eta 0:06:11 lr 0.000025 wd 0.0500 time 0.5170 (0.6041) data time 0.0010 (0.0560) model time 0.0000 (0.0000) loss 7.1398 (6.4827) grad_norm 2.3213 (2.7612) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:01:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][20/625] eta 0:05:42 lr 0.000025 wd 0.0500 time 0.5406 (0.5665) data time 0.0009 (0.0299) model time 0.0000 (0.0000) loss 6.3202 (6.5444) grad_norm 2.7303 (2.7609) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:01:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][30/625] eta 0:05:29 lr 0.000025 wd 0.0500 time 0.5558 (0.5536) data time 0.0010 (0.0206) model time 0.0000 (0.0000) loss 6.3623 (6.5534) grad_norm 2.8654 (2.8282) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:01:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][40/625] eta 0:05:19 lr 0.000025 wd 0.0500 time 0.5177 (0.5462) data time 0.0013 (0.0159) model time 0.0000 (0.0000) loss 8.0087 (6.6110) grad_norm 2.5511 (3.0847) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:01:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][50/625] eta 0:05:12 lr 0.000025 wd 0.0500 time 0.5784 (0.5428) data time 0.0012 (0.0130) model time 0.0000 (0.0000) loss 7.1775 (6.6182) grad_norm 2.4409 (3.0281) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:01:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][60/625] eta 0:05:05 lr 0.000025 wd 0.0500 time 0.5194 (0.5399) data time 0.0011 (0.0111) model time 0.5183 (0.5239) loss 7.7557 (6.6279) grad_norm 2.4298 (2.9354) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:01:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][70/625] eta 0:04:59 lr 0.000025 wd 0.0500 time 0.5518 (0.5390) data time 0.0014 (0.0097) model time 0.5504 (0.5279) loss 6.5012 (6.6319) grad_norm 1.8399 (2.8782) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:01:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][80/625] eta 0:04:52 lr 0.000025 wd 0.0500 time 0.5185 (0.5370) data time 0.0010 (0.0086) model time 0.5175 (0.5260) loss 6.8487 (6.5800) grad_norm 2.1772 (2.8596) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:02:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][90/625] eta 0:04:47 lr 0.000025 wd 0.0500 time 0.5158 (0.5378) data time 0.0010 (0.0078) model time 0.5149 (0.5303) loss 6.5757 (6.5583) grad_norm 2.9924 (2.8456) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:02:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][100/625] eta 0:04:41 lr 0.000025 wd 0.0500 time 0.5158 (0.5363) data time 0.0008 (0.0071) model time 0.5150 (0.5285) loss 6.6790 (6.5591) grad_norm 2.9527 (2.8803) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:02:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][110/625] eta 0:04:36 lr 0.000025 wd 0.0500 time 0.5260 (0.5360) data time 0.0007 (0.0066) model time 0.5252 (0.5290) loss 5.6749 (6.5706) grad_norm 2.8460 (2.8581) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:02:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][120/625] eta 0:04:30 lr 0.000025 wd 0.0500 time 0.5222 (0.5356) data time 0.0008 (0.0061) model time 0.5214 (0.5291) loss 6.4771 (6.5597) grad_norm 1.9869 (2.9317) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:02:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][130/625] eta 0:04:24 lr 0.000025 wd 0.0500 time 0.5319 (0.5346) data time 0.0009 (0.0058) model time 0.5310 (0.5282) loss 6.0372 (6.5512) grad_norm 2.1364 (2.8991) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:02:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][140/625] eta 0:04:18 lr 0.000025 wd 0.0500 time 0.5193 (0.5337) data time 0.0008 (0.0054) model time 0.5184 (0.5274) loss 6.3313 (6.5427) grad_norm 2.6331 (2.8793) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:02:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][150/625] eta 0:04:13 lr 0.000025 wd 0.0500 time 0.5295 (0.5338) data time 0.0011 (0.0052) model time 0.5285 (0.5281) loss 6.6158 (6.5267) grad_norm 2.9665 (2.9199) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:02:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][160/625] eta 0:04:09 lr 0.000025 wd 0.0500 time 0.5193 (0.5358) data time 0.0014 (0.0049) model time 0.5179 (0.5313) loss 6.7434 (6.5562) grad_norm 2.5750 (2.8977) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:02:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][170/625] eta 0:04:03 lr 0.000025 wd 0.0500 time 0.5157 (0.5356) data time 0.0008 (0.0047) model time 0.5149 (0.5314) loss 5.4864 (6.5599) grad_norm 2.3114 (2.8747) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:02:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][180/625] eta 0:03:58 lr 0.000025 wd 0.0500 time 0.5449 (0.5351) data time 0.0010 (0.0045) model time 0.5439 (0.5309) loss 6.9166 (6.5463) grad_norm 2.5194 (2.8444) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:02:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][190/625] eta 0:03:52 lr 0.000025 wd 0.0500 time 0.5171 (0.5351) data time 0.0011 (0.0043) model time 0.5160 (0.5311) loss 7.0593 (6.5558) grad_norm 1.8798 (2.8406) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:02:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][200/625] eta 0:03:47 lr 0.000025 wd 0.0500 time 0.5419 (0.5351) data time 0.0012 (0.0042) model time 0.5406 (0.5313) loss 7.4563 (6.5898) grad_norm 3.4240 (2.8456) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:03:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][210/625] eta 0:03:41 lr 0.000025 wd 0.0500 time 0.5147 (0.5346) data time 0.0012 (0.0040) model time 0.5135 (0.5307) loss 5.2045 (6.5837) grad_norm 2.7906 (2.8386) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:03:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][220/625] eta 0:03:36 lr 0.000025 wd 0.0500 time 0.5577 (0.5344) data time 0.0012 (0.0039) model time 0.5565 (0.5307) loss 8.2456 (6.5929) grad_norm 2.0721 (2.8253) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:03:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][230/625] eta 0:03:30 lr 0.000025 wd 0.0500 time 0.5234 (0.5341) data time 0.0015 (0.0038) model time 0.5219 (0.5304) loss 6.3920 (6.5865) grad_norm 3.1065 (2.8691) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:03:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][240/625] eta 0:03:25 lr 0.000025 wd 0.0500 time 0.5238 (0.5337) data time 0.0009 (0.0037) model time 0.5229 (0.5301) loss 5.9248 (6.5901) grad_norm 2.2395 (2.9177) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:03:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][250/625] eta 0:03:20 lr 0.000025 wd 0.0500 time 0.5429 (0.5338) data time 0.0012 (0.0036) model time 0.5417 (0.5302) loss 5.3604 (6.5881) grad_norm 2.3295 (2.9012) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:03:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][260/625] eta 0:03:14 lr 0.000025 wd 0.0500 time 0.5179 (0.5335) data time 0.0007 (0.0035) model time 0.5171 (0.5301) loss 5.6067 (6.5864) grad_norm 2.1722 (2.8946) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:03:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][270/625] eta 0:03:09 lr 0.000025 wd 0.0500 time 0.5152 (0.5332) data time 0.0009 (0.0034) model time 0.5143 (0.5298) loss 7.0306 (6.5798) grad_norm 2.6632 (2.8914) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:03:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][280/625] eta 0:03:03 lr 0.000025 wd 0.0500 time 0.5241 (0.5332) data time 0.0014 (0.0033) model time 0.5227 (0.5299) loss 6.7267 (6.5823) grad_norm 2.8789 (2.8772) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:03:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][290/625] eta 0:02:58 lr 0.000025 wd 0.0500 time 0.5228 (0.5327) data time 0.0011 (0.0033) model time 0.5217 (0.5294) loss 7.4624 (6.5878) grad_norm 3.2267 (2.8688) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:03:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][300/625] eta 0:02:53 lr 0.000025 wd 0.0500 time 0.5177 (0.5326) data time 0.0008 (0.0032) model time 0.5169 (0.5293) loss 6.7949 (6.5815) grad_norm 2.7046 (2.8660) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:03:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][310/625] eta 0:02:47 lr 0.000025 wd 0.0500 time 0.5328 (0.5329) data time 0.0008 (0.0031) model time 0.5320 (0.5297) loss 6.2105 (6.5907) grad_norm 2.5707 (2.8637) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:04:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][320/625] eta 0:02:42 lr 0.000025 wd 0.0500 time 0.5164 (0.5327) data time 0.0008 (0.0031) model time 0.5157 (0.5296) loss 7.4548 (6.5880) grad_norm 1.9781 (2.8527) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:04:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][330/625] eta 0:02:37 lr 0.000025 wd 0.0500 time 0.5425 (0.5324) data time 0.0008 (0.0030) model time 0.5418 (0.5293) loss 5.8117 (6.5900) grad_norm 2.1057 (2.8634) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:04:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][340/625] eta 0:02:31 lr 0.000025 wd 0.0500 time 0.5165 (0.5322) data time 0.0010 (0.0030) model time 0.5154 (0.5292) loss 7.4831 (6.5860) grad_norm 3.9937 (2.8659) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:04:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][350/625] eta 0:02:26 lr 0.000025 wd 0.0500 time 0.5176 (0.5321) data time 0.0010 (0.0029) model time 0.5166 (0.5291) loss 6.0612 (6.5725) grad_norm 2.6501 (2.8671) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:04:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][360/625] eta 0:02:20 lr 0.000025 wd 0.0500 time 0.5465 (0.5319) data time 0.0007 (0.0029) model time 0.5457 (0.5290) loss 6.3555 (6.5727) grad_norm 2.2748 (2.8601) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:04:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][370/625] eta 0:02:15 lr 0.000025 wd 0.0500 time 0.8103 (0.5326) data time 0.0010 (0.0028) model time 0.8093 (0.5298) loss 7.8615 (6.5867) grad_norm 3.1103 (2.8589) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:04:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][380/625] eta 0:02:10 lr 0.000025 wd 0.0500 time 0.5216 (0.5330) data time 0.0011 (0.0028) model time 0.5205 (0.5303) loss 7.1848 (6.5796) grad_norm 2.4429 (2.8543) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:04:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][390/625] eta 0:02:05 lr 0.000025 wd 0.0500 time 0.6043 (0.5330) data time 0.0018 (0.0027) model time 0.6024 (0.5304) loss 7.0018 (6.5932) grad_norm 2.4963 (2.8425) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:04:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][400/625] eta 0:01:59 lr 0.000025 wd 0.0500 time 0.5166 (0.5327) data time 0.0008 (0.0027) model time 0.5158 (0.5301) loss 5.6932 (6.5885) grad_norm 2.3778 (2.8389) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:04:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][410/625] eta 0:01:54 lr 0.000025 wd 0.0500 time 0.5200 (0.5327) data time 0.0011 (0.0027) model time 0.5189 (0.5301) loss 7.4265 (6.5858) grad_norm 2.4972 (2.8302) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:04:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][420/625] eta 0:01:49 lr 0.000025 wd 0.0500 time 0.5195 (0.5326) data time 0.0011 (0.0026) model time 0.5184 (0.5300) loss 7.6201 (6.5808) grad_norm 16.2941 (2.8564) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:05:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][430/625] eta 0:01:43 lr 0.000024 wd 0.0500 time 0.5162 (0.5325) data time 0.0007 (0.0026) model time 0.5155 (0.5300) loss 7.5063 (6.5795) grad_norm 2.1970 (2.8594) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:05:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][440/625] eta 0:01:38 lr 0.000024 wd 0.0500 time 0.5402 (0.5323) data time 0.0017 (0.0026) model time 0.5385 (0.5298) loss 7.6223 (6.5835) grad_norm 1.9776 (2.8496) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:05:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][450/625] eta 0:01:33 lr 0.000024 wd 0.0500 time 0.5176 (0.5322) data time 0.0010 (0.0025) model time 0.5165 (0.5296) loss 6.9498 (6.5897) grad_norm 2.6958 (2.8357) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:05:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][460/625] eta 0:01:27 lr 0.000024 wd 0.0500 time 0.5168 (0.5320) data time 0.0008 (0.0025) model time 0.5161 (0.5295) loss 7.3029 (6.5859) grad_norm 2.4697 (2.8279) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:05:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][470/625] eta 0:01:22 lr 0.000024 wd 0.0500 time 0.5192 (0.5324) data time 0.0014 (0.0025) model time 0.5178 (0.5299) loss 7.7139 (6.5914) grad_norm 2.4352 (2.8257) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:05:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][480/625] eta 0:01:17 lr 0.000024 wd 0.0500 time 0.5162 (0.5322) data time 0.0012 (0.0024) model time 0.5149 (0.5297) loss 5.6903 (6.5875) grad_norm 3.0795 (2.8301) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:05:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][490/625] eta 0:01:11 lr 0.000024 wd 0.0500 time 0.5402 (0.5324) data time 0.0014 (0.0024) model time 0.5388 (0.5300) loss 6.3642 (6.5867) grad_norm 4.0705 (2.8261) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:05:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][500/625] eta 0:01:06 lr 0.000024 wd 0.0500 time 0.6441 (0.5325) data time 0.0008 (0.0024) model time 0.6433 (0.5301) loss 7.3193 (6.5947) grad_norm 3.0810 (2.8300) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:05:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][510/625] eta 0:01:01 lr 0.000024 wd 0.0500 time 0.5240 (0.5325) data time 0.0010 (0.0024) model time 0.5230 (0.5302) loss 7.3035 (6.5970) grad_norm 3.4030 (2.8323) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:05:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][520/625] eta 0:00:55 lr 0.000024 wd 0.0500 time 0.5253 (0.5324) data time 0.0008 (0.0023) model time 0.5245 (0.5301) loss 5.7621 (6.5996) grad_norm 3.1250 (2.8334) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:05:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][530/625] eta 0:00:50 lr 0.000024 wd 0.0500 time 0.6802 (0.5325) data time 0.0008 (0.0023) model time 0.6795 (0.5302) loss 7.9006 (6.5965) grad_norm 2.9583 (2.8286) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:05:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][540/625] eta 0:00:45 lr 0.000024 wd 0.0500 time 0.5548 (0.5324) data time 0.0009 (0.0023) model time 0.5539 (0.5302) loss 5.7775 (6.5872) grad_norm 1.7417 (2.8205) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:06:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][550/625] eta 0:00:39 lr 0.000024 wd 0.0500 time 0.5719 (0.5324) data time 0.0009 (0.0023) model time 0.5710 (0.5301) loss 7.9055 (6.5898) grad_norm 22.1361 (2.8675) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:06:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][560/625] eta 0:00:34 lr 0.000024 wd 0.0500 time 0.5157 (0.5323) data time 0.0008 (0.0023) model time 0.5149 (0.5300) loss 7.5512 (6.5926) grad_norm 2.5142 (2.8588) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:06:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][570/625] eta 0:00:29 lr 0.000024 wd 0.0500 time 0.5213 (0.5322) data time 0.0010 (0.0023) model time 0.5202 (0.5299) loss 5.6195 (6.5892) grad_norm 2.6976 (2.8628) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:06:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][580/625] eta 0:00:23 lr 0.000024 wd 0.0500 time 0.5155 (0.5322) data time 0.0011 (0.0022) model time 0.5144 (0.5299) loss 7.7338 (6.5894) grad_norm 2.0935 (2.8591) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:06:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][590/625] eta 0:00:18 lr 0.000024 wd 0.0500 time 0.5164 (0.5320) data time 0.0008 (0.0022) model time 0.5156 (0.5298) loss 6.4879 (6.5899) grad_norm 1.7243 (2.8522) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:06:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][600/625] eta 0:00:13 lr 0.000024 wd 0.0500 time 0.5441 (0.5326) data time 0.0011 (0.0022) model time 0.5430 (0.5304) loss 6.7205 (6.5917) grad_norm 2.3148 (2.8468) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:06:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][610/625] eta 0:00:07 lr 0.000024 wd 0.0500 time 0.5149 (0.5324) data time 0.0008 (0.0022) model time 0.5140 (0.5302) loss 7.1809 (6.5918) grad_norm 3.1677 (2.8462) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:06:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [281/300][620/625] eta 0:00:02 lr 0.000024 wd 0.0500 time 0.5153 (0.5323) data time 0.0008 (0.0022) model time 0.5145 (0.5301) loss 7.5881 (6.6021) grad_norm 2.2869 (2.8456) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:06:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 281 training takes 0:05:32 +[2024-07-29 08:06:44 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 08:06:45 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 08:06:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.583 (0.583) Loss 0.4897 (0.4897) Acc@1 90.430 (90.430) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-29 08:06:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.117 (0.160) Loss 0.7354 (0.5914) Acc@1 83.252 (88.303) Acc@5 97.217 (98.153) Mem 22339MB +[2024-07-29 08:06:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.139) Loss 0.8096 (0.6763) Acc@1 81.055 (85.658) Acc@5 96.387 (97.428) Mem 22339MB +[2024-07-29 08:06:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.285 Acc@5 97.431 +[2024-07-29 08:06:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 08:06:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.947 (0.947) Loss 0.4927 (0.4927) Acc@1 90.430 (90.430) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-29 08:06:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.197) Loss 0.7334 (0.5958) Acc@1 83.252 (88.263) Acc@5 97.363 (98.158) Mem 22339MB +[2024-07-29 08:06:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.159) Loss 0.8140 (0.6797) Acc@1 81.055 (85.631) Acc@5 96.387 (97.398) Mem 22339MB +[2024-07-29 08:06:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.251 Acc@5 97.393 +[2024-07-29 08:06:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 08:06:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][0/625] eta 0:16:25 lr 0.000024 wd 0.0500 time 1.5775 (1.5775) data time 0.9042 (0.9042) model time 0.0000 (0.0000) loss 6.9132 (6.9132) grad_norm 1.9512 (1.9512) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:06:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][10/625] eta 0:06:21 lr 0.000024 wd 0.0500 time 0.5182 (0.6210) data time 0.0010 (0.0833) model time 0.0000 (0.0000) loss 7.1626 (6.4215) grad_norm 2.6972 (2.9830) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:07:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][20/625] eta 0:05:47 lr 0.000024 wd 0.0500 time 0.5157 (0.5742) data time 0.0011 (0.0442) model time 0.0000 (0.0000) loss 6.5100 (6.5197) grad_norm 2.7282 (3.3123) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:07:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][30/625] eta 0:05:33 lr 0.000024 wd 0.0500 time 0.5651 (0.5604) data time 0.0011 (0.0303) model time 0.0000 (0.0000) loss 7.1869 (6.5733) grad_norm 2.3555 (3.1516) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:07:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][40/625] eta 0:05:21 lr 0.000024 wd 0.0500 time 0.5161 (0.5500) data time 0.0009 (0.0232) model time 0.0000 (0.0000) loss 5.9294 (6.6009) grad_norm 2.4058 (3.0019) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:07:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][50/625] eta 0:05:14 lr 0.000024 wd 0.0500 time 0.5187 (0.5463) data time 0.0009 (0.0189) model time 0.0000 (0.0000) loss 5.9123 (6.6037) grad_norm 2.6915 (2.8602) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:07:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][60/625] eta 0:05:06 lr 0.000024 wd 0.0500 time 0.5241 (0.5429) data time 0.0009 (0.0160) model time 0.5232 (0.5245) loss 6.0120 (6.5665) grad_norm 3.1512 (2.8356) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:07:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][70/625] eta 0:05:00 lr 0.000024 wd 0.0500 time 0.5244 (0.5409) data time 0.0008 (0.0139) model time 0.5236 (0.5258) loss 6.8371 (6.5558) grad_norm 2.9900 (2.7981) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:07:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][80/625] eta 0:04:53 lr 0.000024 wd 0.0500 time 0.5249 (0.5381) data time 0.0010 (0.0123) model time 0.5240 (0.5231) loss 5.5325 (6.5547) grad_norm 2.3828 (2.8656) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:07:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][90/625] eta 0:04:48 lr 0.000024 wd 0.0500 time 0.7162 (0.5391) data time 0.0007 (0.0111) model time 0.7155 (0.5288) loss 6.4982 (6.5405) grad_norm 2.4719 (2.8801) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:07:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][100/625] eta 0:04:42 lr 0.000024 wd 0.0500 time 0.5404 (0.5381) data time 0.0009 (0.0101) model time 0.5395 (0.5286) loss 7.2012 (6.5157) grad_norm 2.7053 (2.8761) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:07:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][110/625] eta 0:04:36 lr 0.000024 wd 0.0500 time 0.5169 (0.5373) data time 0.0010 (0.0093) model time 0.5159 (0.5285) loss 7.6132 (6.5337) grad_norm 4.1436 (2.8707) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:07:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][120/625] eta 0:04:30 lr 0.000024 wd 0.0500 time 0.5625 (0.5362) data time 0.0014 (0.0086) model time 0.5611 (0.5278) loss 7.8890 (6.5434) grad_norm 2.5970 (2.9303) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:08:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][130/625] eta 0:04:25 lr 0.000024 wd 0.0500 time 0.5169 (0.5356) data time 0.0010 (0.0080) model time 0.5158 (0.5276) loss 6.0957 (6.5441) grad_norm 2.0397 (2.9646) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:08:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][140/625] eta 0:04:19 lr 0.000024 wd 0.0500 time 0.5338 (0.5351) data time 0.0008 (0.0075) model time 0.5329 (0.5277) loss 6.8839 (6.5385) grad_norm 2.1661 (2.9792) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:08:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][150/625] eta 0:04:14 lr 0.000024 wd 0.0500 time 0.5171 (0.5351) data time 0.0010 (0.0071) model time 0.5161 (0.5282) loss 6.6127 (6.5483) grad_norm 9.2337 (2.9926) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:08:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][160/625] eta 0:04:08 lr 0.000024 wd 0.0500 time 0.5415 (0.5344) data time 0.0014 (0.0068) model time 0.5401 (0.5277) loss 7.3035 (6.5498) grad_norm 2.5530 (2.9766) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:08:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][170/625] eta 0:04:02 lr 0.000024 wd 0.0500 time 0.5729 (0.5337) data time 0.0012 (0.0064) model time 0.5718 (0.5272) loss 6.9407 (6.5370) grad_norm 2.8804 (3.0432) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:08:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][180/625] eta 0:03:57 lr 0.000024 wd 0.0500 time 0.5287 (0.5333) data time 0.0011 (0.0061) model time 0.5277 (0.5270) loss 7.4877 (6.5566) grad_norm 2.4510 (3.1365) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:08:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][190/625] eta 0:03:52 lr 0.000024 wd 0.0500 time 0.5162 (0.5336) data time 0.0008 (0.0059) model time 0.5154 (0.5279) loss 5.7031 (6.5588) grad_norm 1.7659 (3.0929) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:08:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][200/625] eta 0:03:47 lr 0.000024 wd 0.0500 time 0.5199 (0.5345) data time 0.0012 (0.0056) model time 0.5187 (0.5294) loss 6.8825 (6.5549) grad_norm 2.4576 (3.0676) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:08:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][210/625] eta 0:03:41 lr 0.000024 wd 0.0500 time 0.5231 (0.5341) data time 0.0009 (0.0054) model time 0.5222 (0.5291) loss 7.2986 (6.5374) grad_norm 1.8557 (3.0501) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:08:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][220/625] eta 0:03:36 lr 0.000024 wd 0.0500 time 0.5179 (0.5337) data time 0.0011 (0.0052) model time 0.5168 (0.5287) loss 8.5255 (6.5553) grad_norm 2.6803 (3.0278) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:08:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][230/625] eta 0:03:30 lr 0.000024 wd 0.0500 time 0.5197 (0.5331) data time 0.0014 (0.0051) model time 0.5183 (0.5282) loss 7.0389 (6.5642) grad_norm 2.2726 (3.0169) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:09:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][240/625] eta 0:03:25 lr 0.000024 wd 0.0500 time 0.5183 (0.5331) data time 0.0011 (0.0049) model time 0.5173 (0.5284) loss 5.6324 (6.5675) grad_norm 3.2687 (3.0353) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:09:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][250/625] eta 0:03:19 lr 0.000024 wd 0.0500 time 0.5243 (0.5331) data time 0.0010 (0.0047) model time 0.5233 (0.5286) loss 6.3133 (6.5791) grad_norm 3.3451 (3.0252) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:09:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][260/625] eta 0:03:14 lr 0.000024 wd 0.0500 time 0.5174 (0.5329) data time 0.0011 (0.0046) model time 0.5163 (0.5285) loss 6.4966 (6.5696) grad_norm 1.8688 (3.0078) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:09:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][270/625] eta 0:03:09 lr 0.000024 wd 0.0500 time 0.5583 (0.5326) data time 0.0033 (0.0045) model time 0.5550 (0.5283) loss 6.2867 (6.5841) grad_norm 2.1841 (3.0088) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:09:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][280/625] eta 0:03:03 lr 0.000023 wd 0.0500 time 0.5232 (0.5323) data time 0.0010 (0.0044) model time 0.5222 (0.5280) loss 6.6273 (6.5650) grad_norm 2.4883 (3.0058) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:09:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][290/625] eta 0:02:58 lr 0.000023 wd 0.0500 time 0.5161 (0.5319) data time 0.0009 (0.0043) model time 0.5151 (0.5277) loss 7.5233 (6.5660) grad_norm 2.0538 (3.0137) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:09:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][300/625] eta 0:02:52 lr 0.000023 wd 0.0500 time 0.5160 (0.5318) data time 0.0011 (0.0041) model time 0.5148 (0.5277) loss 6.6834 (6.5745) grad_norm 2.3106 (3.0151) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:09:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][310/625] eta 0:02:47 lr 0.000023 wd 0.0500 time 0.5341 (0.5321) data time 0.0010 (0.0041) model time 0.5331 (0.5282) loss 6.6050 (6.5789) grad_norm 2.7323 (3.0127) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:09:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][320/625] eta 0:02:42 lr 0.000023 wd 0.0500 time 0.5170 (0.5320) data time 0.0008 (0.0040) model time 0.5162 (0.5281) loss 6.2520 (6.5818) grad_norm 3.0086 (3.0043) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:09:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][330/625] eta 0:02:36 lr 0.000023 wd 0.0500 time 0.5401 (0.5317) data time 0.0015 (0.0039) model time 0.5386 (0.5279) loss 6.5103 (6.5725) grad_norm 1.9279 (3.0015) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:09:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][340/625] eta 0:02:31 lr 0.000023 wd 0.0500 time 0.5175 (0.5315) data time 0.0010 (0.0038) model time 0.5165 (0.5278) loss 5.8429 (6.5776) grad_norm 2.6260 (2.9923) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:09:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][350/625] eta 0:02:26 lr 0.000023 wd 0.0500 time 0.5363 (0.5314) data time 0.0013 (0.0037) model time 0.5351 (0.5277) loss 7.3874 (6.5742) grad_norm 2.4291 (3.0271) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:10:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][360/625] eta 0:02:20 lr 0.000023 wd 0.0500 time 0.5168 (0.5312) data time 0.0010 (0.0037) model time 0.5158 (0.5276) loss 7.3950 (6.5810) grad_norm 2.5508 (3.0361) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:10:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][370/625] eta 0:02:15 lr 0.000023 wd 0.0500 time 0.5430 (0.5313) data time 0.0016 (0.0036) model time 0.5414 (0.5278) loss 7.1200 (6.5878) grad_norm 2.7051 (3.0211) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:10:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][380/625] eta 0:02:10 lr 0.000023 wd 0.0500 time 0.5318 (0.5310) data time 0.0014 (0.0035) model time 0.5304 (0.5275) loss 7.1174 (6.5927) grad_norm 5.7317 (3.0269) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:10:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][390/625] eta 0:02:04 lr 0.000023 wd 0.0500 time 0.5175 (0.5308) data time 0.0010 (0.0035) model time 0.5165 (0.5273) loss 7.3267 (6.5910) grad_norm 2.3036 (3.0247) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:10:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][400/625] eta 0:01:59 lr 0.000023 wd 0.0500 time 0.5332 (0.5306) data time 0.0008 (0.0034) model time 0.5324 (0.5272) loss 5.8854 (6.5930) grad_norm 1.8656 (3.0071) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:10:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][410/625] eta 0:01:54 lr 0.000023 wd 0.0500 time 0.7295 (0.5316) data time 0.0008 (0.0034) model time 0.7287 (0.5284) loss 6.3922 (6.5970) grad_norm 3.2718 (3.0125) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:10:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][420/625] eta 0:01:48 lr 0.000023 wd 0.0500 time 0.5166 (0.5315) data time 0.0007 (0.0033) model time 0.5158 (0.5283) loss 7.1984 (6.5844) grad_norm 2.8042 (3.0021) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 08:10:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 08:10:38 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 08:10:39 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 08:12:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 08:12:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 08:12:56 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 08:13:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 08:13:08 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 08:13:08 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 08:13:09 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 08:13:09 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 282) +[2024-07-29 08:13:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 08:13:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][430/625] eta 0:07:04 lr 0.000023 wd 0.0500 time 0.5880 (2.1765) data time 0.0012 (0.1125) model time 0.5868 (2.0639) loss 6.5759 (6.7911) grad_norm 2.5514 (2.3140) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 08:13:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][440/625] eta 0:03:49 lr 0.000023 wd 0.0500 time 0.5825 (1.2417) data time 0.0010 (0.0470) model time 0.5814 (1.1947) loss 6.1271 (6.7137) grad_norm 2.9317 (2.4928) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 08:13:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][450/625] eta 0:02:54 lr 0.000023 wd 0.0500 time 0.5848 (0.9990) data time 0.0007 (0.0299) model time 0.5840 (0.9691) loss 6.9505 (6.7353) grad_norm 2.6851 (2.4901) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 08:13:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][460/625] eta 0:02:26 lr 0.000023 wd 0.0500 time 0.5855 (0.8873) data time 0.0010 (0.0221) model time 0.5845 (0.8651) loss 6.6943 (6.7845) grad_norm 3.9355 (2.4798) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 08:13:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][470/625] eta 0:02:07 lr 0.000023 wd 0.0500 time 0.5852 (0.8233) data time 0.0009 (0.0176) model time 0.5844 (0.8056) loss 6.3106 (6.7621) grad_norm 2.5165 (2.4572) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 08:13:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][480/625] eta 0:01:54 lr 0.000023 wd 0.0500 time 0.5874 (0.7896) data time 0.0010 (0.0147) model time 0.5863 (0.7749) loss 6.3857 (6.7765) grad_norm 2.5146 (2.7306) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 08:14:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 08:14:03 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 08:14:09 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 08:16:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 08:16:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 08:16:25 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 08:16:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 08:16:43 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 08:16:44 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 08:16:44 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 08:16:44 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 282) +[2024-07-29 08:16:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 08:17:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][490/625] eta 0:09:53 lr 0.000023 wd 0.0500 time 0.5990 (4.3980) data time 0.0009 (0.2953) model time 0.5982 (4.1027) loss 6.3639 (6.9084) grad_norm 1.8998 (1.9710) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 08:17:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][500/625] eta 0:03:04 lr 0.000023 wd 0.0500 time 0.5950 (1.4722) data time 0.0010 (0.0690) model time 0.5940 (1.4032) loss 6.4780 (6.8575) grad_norm 2.0377 (2.4645) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 08:17:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][510/625] eta 0:02:05 lr 0.000023 wd 0.0500 time 0.5885 (1.0912) data time 0.0009 (0.0395) model time 0.5877 (1.0517) loss 7.5357 (6.8539) grad_norm 2.5186 (2.4359) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 08:17:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][520/625] eta 0:01:38 lr 0.000023 wd 0.0500 time 0.5923 (0.9389) data time 0.0009 (0.0278) model time 0.5914 (0.9110) loss 7.1367 (6.8668) grad_norm 3.5493 (3.0159) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 08:17:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][530/625] eta 0:01:21 lr 0.000023 wd 0.0500 time 0.5868 (0.8574) data time 0.0012 (0.0216) model time 0.5856 (0.8357) loss 6.7741 (6.7308) grad_norm 1.9991 (3.1117) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 08:17:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][540/625] eta 0:01:08 lr 0.000023 wd 0.0500 time 0.5889 (0.8103) data time 0.0011 (0.0178) model time 0.5878 (0.7926) loss 7.4222 (6.7286) grad_norm 2.9884 (3.0048) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 08:17:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][550/625] eta 0:00:58 lr 0.000023 wd 0.0500 time 0.5995 (0.7795) data time 0.0011 (0.0151) model time 0.5984 (0.7644) loss 6.0452 (6.6762) grad_norm 2.7129 (2.9985) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 08:17:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][560/625] eta 0:00:49 lr 0.000023 wd 0.0500 time 0.6001 (0.7549) data time 0.0012 (0.0132) model time 0.5989 (0.7417) loss 8.0209 (6.6759) grad_norm 2.1395 (2.9612) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 08:17:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][570/625] eta 0:00:40 lr 0.000023 wd 0.0500 time 0.6001 (0.7361) data time 0.0008 (0.0117) model time 0.5993 (0.7244) loss 5.2347 (6.6420) grad_norm 2.7165 (2.8890) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 08:17:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][580/625] eta 0:00:32 lr 0.000023 wd 0.0500 time 0.6008 (0.7215) data time 0.0008 (0.0106) model time 0.6000 (0.7110) loss 7.6112 (6.6188) grad_norm 3.1581 (2.8436) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 08:18:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][590/625] eta 0:00:24 lr 0.000023 wd 0.0500 time 0.5916 (0.7090) data time 0.0009 (0.0096) model time 0.5907 (0.6994) loss 6.6468 (6.6379) grad_norm 5.9279 (2.8370) loss_scale 512.0000 (278.3689) mem 22344MB +[2024-07-29 08:18:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][600/625] eta 0:00:17 lr 0.000023 wd 0.0500 time 0.5912 (0.6986) data time 0.0010 (0.0089) model time 0.5902 (0.6897) loss 6.4238 (6.6325) grad_norm 2.6671 (2.8330) loss_scale 512.0000 (299.0442) mem 22344MB +[2024-07-29 08:18:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][610/625] eta 0:00:10 lr 0.000023 wd 0.0500 time 0.5936 (0.6899) data time 0.0008 (0.0083) model time 0.5928 (0.6817) loss 6.5777 (6.6371) grad_norm 2.9603 (2.7998) loss_scale 512.0000 (316.3577) mem 22344MB +[2024-07-29 08:18:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [282/300][620/625] eta 0:00:03 lr 0.000023 wd 0.0500 time 0.5990 (0.6825) data time 0.0008 (0.0077) model time 0.5982 (0.6748) loss 7.2943 (6.6442) grad_norm 7.4902 (2.8541) loss_scale 512.0000 (331.0677) mem 22344MB +[2024-07-29 08:18:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 282 training takes 0:01:33 +[2024-07-29 08:18:21 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 08:18:26 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 08:18:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.519 (0.519) Loss 0.4907 (0.4907) Acc@1 90.430 (90.430) Acc@5 98.975 (98.975) Mem 22344MB +[2024-07-29 08:18:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.127 (0.163) Loss 0.7344 (0.5914) Acc@1 83.154 (88.255) Acc@5 97.266 (98.162) Mem 22344MB +[2024-07-29 08:18:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.145) Loss 0.8115 (0.6773) Acc@1 80.859 (85.684) Acc@5 96.338 (97.414) Mem 22344MB +[2024-07-29 08:18:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.347 Acc@5 97.403 +[2024-07-29 08:18:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 08:18:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 85.35% +[2024-07-29 08:18:32 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-29 08:18:35 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-29 08:18:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.517 (0.517) Loss 0.4927 (0.4927) Acc@1 90.479 (90.479) Acc@5 99.023 (99.023) Mem 22344MB +[2024-07-29 08:18:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.161) Loss 0.7334 (0.5954) Acc@1 83.252 (88.259) Acc@5 97.363 (98.167) Mem 22344MB +[2024-07-29 08:18:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.145) Loss 0.8135 (0.6795) Acc@1 81.104 (85.638) Acc@5 96.387 (97.407) Mem 22344MB +[2024-07-29 08:18:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.249 Acc@5 97.401 +[2024-07-29 08:18:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.2% +[2024-07-29 08:18:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.25% +[2024-07-29 08:18:38 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 08:18:41 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 08:18:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][0/625] eta 0:11:49 lr 0.000023 wd 0.0500 time 1.1347 (1.1347) data time 0.4246 (0.4246) model time 0.0000 (0.0000) loss 6.8472 (6.8472) grad_norm 2.4752 (2.4752) loss_scale 512.0000 (512.0000) mem 22337MB +[2024-07-29 08:18:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][10/625] eta 0:06:34 lr 0.000023 wd 0.0500 time 0.5936 (0.6420) data time 0.0008 (0.0395) model time 0.0000 (0.0000) loss 6.6654 (6.5187) grad_norm 2.1297 (2.4781) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:18:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][20/625] eta 0:06:14 lr 0.000023 wd 0.0500 time 0.5950 (0.6184) data time 0.0009 (0.0212) model time 0.0000 (0.0000) loss 5.8570 (6.4799) grad_norm 4.1113 (3.5640) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:19:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][30/625] eta 0:06:05 lr 0.000023 wd 0.0500 time 0.6979 (0.6135) data time 0.0010 (0.0147) model time 0.0000 (0.0000) loss 7.6314 (6.6066) grad_norm 3.1617 (3.4074) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:19:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][40/625] eta 0:05:57 lr 0.000023 wd 0.0500 time 0.5950 (0.6106) data time 0.0008 (0.0131) model time 0.0000 (0.0000) loss 5.7198 (6.5777) grad_norm 2.6785 (3.2567) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:19:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][50/625] eta 0:05:52 lr 0.000023 wd 0.0500 time 0.5925 (0.6123) data time 0.0008 (0.0107) model time 0.0000 (0.0000) loss 7.0455 (6.6104) grad_norm 2.8427 (3.1179) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:19:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][60/625] eta 0:05:44 lr 0.000023 wd 0.0500 time 0.6035 (0.6105) data time 0.0011 (0.0091) model time 0.6024 (0.6004) loss 5.4087 (6.5856) grad_norm 3.0077 (3.5942) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:19:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][70/625] eta 0:05:37 lr 0.000023 wd 0.0500 time 0.5922 (0.6084) data time 0.0011 (0.0081) model time 0.5911 (0.5970) loss 6.9937 (6.5358) grad_norm 2.2287 (3.5095) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:19:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][80/625] eta 0:05:32 lr 0.000023 wd 0.0500 time 0.5994 (0.6094) data time 0.0011 (0.0073) model time 0.5984 (0.6029) loss 6.0806 (6.5280) grad_norm 2.0663 (3.4472) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:19:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][90/625] eta 0:05:25 lr 0.000023 wd 0.0500 time 0.6142 (0.6086) data time 0.0011 (0.0066) model time 0.6132 (0.6023) loss 8.0002 (6.5770) grad_norm 3.1073 (3.3631) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:19:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][100/625] eta 0:05:18 lr 0.000023 wd 0.0500 time 0.6105 (0.6074) data time 0.0009 (0.0061) model time 0.6096 (0.6010) loss 7.7401 (6.5658) grad_norm 1.8271 (3.2816) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:19:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][110/625] eta 0:05:12 lr 0.000023 wd 0.0500 time 0.5952 (0.6066) data time 0.0012 (0.0057) model time 0.5941 (0.6002) loss 5.9707 (6.5486) grad_norm 2.0212 (3.1919) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:19:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][120/625] eta 0:05:05 lr 0.000023 wd 0.0500 time 0.5968 (0.6055) data time 0.0011 (0.0053) model time 0.5957 (0.5991) loss 6.5056 (6.5151) grad_norm 2.2271 (3.1231) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:20:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][130/625] eta 0:04:59 lr 0.000023 wd 0.0500 time 0.5981 (0.6047) data time 0.0008 (0.0050) model time 0.5973 (0.5985) loss 6.4050 (6.4941) grad_norm 2.7479 (3.1079) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:20:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][140/625] eta 0:04:53 lr 0.000022 wd 0.0500 time 0.5927 (0.6055) data time 0.0012 (0.0047) model time 0.5915 (0.6003) loss 5.0211 (6.4918) grad_norm 2.2892 (3.0639) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:20:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][150/625] eta 0:04:47 lr 0.000022 wd 0.0500 time 0.5950 (0.6048) data time 0.0008 (0.0045) model time 0.5941 (0.5996) loss 6.9597 (6.5131) grad_norm 2.5562 (3.0912) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:20:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][160/625] eta 0:04:40 lr 0.000022 wd 0.0500 time 0.5954 (0.6041) data time 0.0010 (0.0043) model time 0.5944 (0.5990) loss 5.4452 (6.4885) grad_norm 2.5931 (3.0725) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:20:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][170/625] eta 0:04:34 lr 0.000022 wd 0.0500 time 0.5960 (0.6036) data time 0.0009 (0.0041) model time 0.5951 (0.5986) loss 6.5103 (6.5044) grad_norm 2.6849 (3.0553) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:20:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][180/625] eta 0:04:28 lr 0.000022 wd 0.0500 time 0.5996 (0.6031) data time 0.0012 (0.0039) model time 0.5984 (0.5982) loss 6.3294 (6.5324) grad_norm 3.2378 (3.0446) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:20:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][190/625] eta 0:04:22 lr 0.000022 wd 0.0500 time 0.5947 (0.6026) data time 0.0009 (0.0038) model time 0.5939 (0.5979) loss 6.4517 (6.5460) grad_norm 3.2602 (3.0635) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:20:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][200/625] eta 0:04:15 lr 0.000022 wd 0.0500 time 0.5930 (0.6022) data time 0.0011 (0.0036) model time 0.5919 (0.5975) loss 6.9046 (6.5483) grad_norm 9.1440 (3.0664) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:20:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][210/625] eta 0:04:09 lr 0.000022 wd 0.0500 time 0.5977 (0.6017) data time 0.0009 (0.0035) model time 0.5969 (0.5971) loss 6.8141 (6.5569) grad_norm 2.7750 (3.0709) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:20:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][220/625] eta 0:04:03 lr 0.000022 wd 0.0500 time 0.5925 (0.6014) data time 0.0010 (0.0034) model time 0.5915 (0.5969) loss 7.2623 (6.5582) grad_norm 2.7774 (3.0949) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:21:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][230/625] eta 0:03:57 lr 0.000022 wd 0.0500 time 0.5934 (0.6012) data time 0.0009 (0.0033) model time 0.5926 (0.5968) loss 5.6362 (6.5515) grad_norm 2.3242 (3.0823) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:21:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][240/625] eta 0:03:51 lr 0.000022 wd 0.0500 time 0.5961 (0.6009) data time 0.0011 (0.0032) model time 0.5950 (0.5966) loss 7.1538 (6.5620) grad_norm 2.4360 (3.1166) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:21:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][250/625] eta 0:03:45 lr 0.000022 wd 0.0500 time 0.5948 (0.6006) data time 0.0008 (0.0031) model time 0.5940 (0.5964) loss 6.0862 (6.5477) grad_norm 2.6904 (3.0999) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:21:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][260/625] eta 0:03:39 lr 0.000022 wd 0.0500 time 0.5414 (0.6010) data time 0.0009 (0.0030) model time 0.5405 (0.5971) loss 6.4944 (6.5531) grad_norm 2.8368 (3.0753) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:21:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][270/625] eta 0:03:33 lr 0.000022 wd 0.0500 time 0.5928 (0.6007) data time 0.0011 (0.0030) model time 0.5917 (0.5969) loss 7.4086 (6.5565) grad_norm 2.8290 (3.0550) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:21:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][280/625] eta 0:03:27 lr 0.000022 wd 0.0500 time 0.5961 (0.6004) data time 0.0008 (0.0029) model time 0.5953 (0.5966) loss 7.0734 (6.5514) grad_norm 3.6549 (3.0322) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:21:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][290/625] eta 0:03:21 lr 0.000022 wd 0.0500 time 0.5934 (0.6002) data time 0.0008 (0.0028) model time 0.5926 (0.5964) loss 7.8175 (6.5605) grad_norm 3.2222 (3.0394) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:21:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][300/625] eta 0:03:14 lr 0.000022 wd 0.0500 time 0.5920 (0.5999) data time 0.0010 (0.0028) model time 0.5910 (0.5962) loss 5.9551 (6.5674) grad_norm 1.9296 (3.0171) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:21:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][310/625] eta 0:03:08 lr 0.000022 wd 0.0500 time 0.5972 (0.5997) data time 0.0012 (0.0027) model time 0.5960 (0.5961) loss 5.9930 (6.5570) grad_norm 2.2537 (3.0051) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:21:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][320/625] eta 0:03:02 lr 0.000022 wd 0.0500 time 0.5943 (0.5996) data time 0.0008 (0.0027) model time 0.5935 (0.5960) loss 6.7107 (6.5510) grad_norm 3.2724 (2.9922) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:22:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][330/625] eta 0:02:56 lr 0.000022 wd 0.0500 time 0.6043 (0.5994) data time 0.0011 (0.0026) model time 0.6032 (0.5959) loss 5.8406 (6.5418) grad_norm 2.7074 (2.9856) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:22:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][340/625] eta 0:02:50 lr 0.000022 wd 0.0500 time 0.5944 (0.5993) data time 0.0010 (0.0026) model time 0.5934 (0.5958) loss 6.2192 (6.5342) grad_norm 2.1916 (2.9729) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:22:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][350/625] eta 0:02:44 lr 0.000022 wd 0.0500 time 0.5927 (0.5991) data time 0.0011 (0.0025) model time 0.5916 (0.5957) loss 7.4481 (6.5501) grad_norm 2.6651 (2.9710) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:22:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][360/625] eta 0:02:38 lr 0.000022 wd 0.0500 time 0.5892 (0.5994) data time 0.0009 (0.0025) model time 0.5883 (0.5961) loss 7.2234 (6.5535) grad_norm 2.0941 (2.9591) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:22:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][370/625] eta 0:02:32 lr 0.000022 wd 0.0500 time 0.6001 (0.5992) data time 0.0007 (0.0025) model time 0.5994 (0.5960) loss 7.3889 (6.5608) grad_norm 1.9859 (2.9551) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:22:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][380/625] eta 0:02:26 lr 0.000022 wd 0.0500 time 0.5964 (0.5991) data time 0.0010 (0.0024) model time 0.5954 (0.5959) loss 6.4774 (6.5704) grad_norm 2.0792 (2.9394) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:22:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][390/625] eta 0:02:20 lr 0.000022 wd 0.0500 time 0.5960 (0.5990) data time 0.0008 (0.0024) model time 0.5952 (0.5959) loss 5.1074 (6.5616) grad_norm 2.0569 (2.9211) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:22:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][400/625] eta 0:02:14 lr 0.000022 wd 0.0500 time 0.5905 (0.5988) data time 0.0008 (0.0023) model time 0.5897 (0.5957) loss 6.7547 (6.5594) grad_norm 2.7395 (2.9211) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:22:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][410/625] eta 0:02:08 lr 0.000022 wd 0.0500 time 0.5954 (0.5987) data time 0.0008 (0.0023) model time 0.5946 (0.5956) loss 6.6874 (6.5583) grad_norm 2.1480 (2.9088) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:22:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][420/625] eta 0:02:02 lr 0.000022 wd 0.0500 time 0.5929 (0.5986) data time 0.0010 (0.0023) model time 0.5919 (0.5956) loss 6.7170 (6.5632) grad_norm 2.7797 (2.9133) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:22:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][430/625] eta 0:01:56 lr 0.000022 wd 0.0500 time 0.5920 (0.5985) data time 0.0009 (0.0023) model time 0.5912 (0.5955) loss 6.9326 (6.5717) grad_norm 1.7706 (2.9083) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:23:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][440/625] eta 0:01:50 lr 0.000022 wd 0.0500 time 0.5964 (0.5984) data time 0.0010 (0.0022) model time 0.5954 (0.5954) loss 7.4174 (6.5738) grad_norm 4.4160 (2.9059) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:23:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][450/625] eta 0:01:44 lr 0.000022 wd 0.0500 time 0.5981 (0.5983) data time 0.0010 (0.0022) model time 0.5971 (0.5954) loss 7.6436 (6.5807) grad_norm 3.4367 (2.9107) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:23:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][460/625] eta 0:01:38 lr 0.000022 wd 0.0500 time 0.5937 (0.5982) data time 0.0010 (0.0022) model time 0.5927 (0.5954) loss 5.7795 (6.5781) grad_norm 2.4958 (2.9224) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:23:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][470/625] eta 0:01:32 lr 0.000022 wd 0.0500 time 0.6023 (0.5982) data time 0.0011 (0.0022) model time 0.6012 (0.5953) loss 7.0644 (6.5722) grad_norm 1.7697 (2.9211) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:23:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][480/625] eta 0:01:26 lr 0.000022 wd 0.0500 time 0.5959 (0.5981) data time 0.0010 (0.0021) model time 0.5949 (0.5953) loss 6.7657 (6.5787) grad_norm 2.6374 (2.9216) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:23:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 08:23:31 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 08:23:32 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 08:37:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 08:37:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 08:37:40 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 08:39:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 08:39:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 08:40:16 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 08:40:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 08:40:28 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 08:40:28 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 08:40:28 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 08:40:29 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 283) +[2024-07-29 08:40:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 08:40:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][490/625] eta 0:04:47 lr 0.000022 wd 0.0500 time 0.5812 (2.1271) data time 0.0014 (0.1147) model time 0.5798 (2.0124) loss 7.0442 (7.2428) grad_norm 2.7002 (2.6964) loss_scale 512.0000 (512.0000) mem 22346MB +[2024-07-29 08:40:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][500/625] eta 0:02:32 lr 0.000022 wd 0.0500 time 0.5836 (1.2193) data time 0.0010 (0.0478) model time 0.5825 (1.1714) loss 5.9291 (6.9055) grad_norm 3.0426 (2.5872) loss_scale 512.0000 (512.0000) mem 22346MB +[2024-07-29 08:41:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][510/625] eta 0:01:54 lr 0.000022 wd 0.0500 time 0.5842 (0.9955) data time 0.0008 (0.0305) model time 0.5834 (0.9650) loss 8.1595 (6.9276) grad_norm 3.0482 (2.5911) loss_scale 512.0000 (512.0000) mem 22346MB +[2024-07-29 08:41:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][520/625] eta 0:01:32 lr 0.000022 wd 0.0500 time 0.5868 (0.8847) data time 0.0011 (0.0225) model time 0.5858 (0.8622) loss 6.4732 (6.8992) grad_norm 2.0103 (2.6407) loss_scale 512.0000 (512.0000) mem 22346MB +[2024-07-29 08:41:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][530/625] eta 0:01:17 lr 0.000022 wd 0.0500 time 0.5833 (0.8210) data time 0.0008 (0.0180) model time 0.5825 (0.8031) loss 7.3313 (6.8038) grad_norm 3.6552 (2.7007) loss_scale 512.0000 (512.0000) mem 22346MB +[2024-07-29 08:41:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][540/625] eta 0:01:06 lr 0.000022 wd 0.0500 time 0.5865 (0.7879) data time 0.0011 (0.0150) model time 0.5854 (0.7729) loss 7.1979 (6.7517) grad_norm 2.4470 (2.7005) loss_scale 512.0000 (512.0000) mem 22346MB +[2024-07-29 08:41:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][550/625] eta 0:00:56 lr 0.000022 wd 0.0500 time 0.5857 (0.7581) data time 0.0011 (0.0129) model time 0.5846 (0.7452) loss 6.3865 (6.6812) grad_norm 2.4022 (2.6652) loss_scale 512.0000 (512.0000) mem 22346MB +[2024-07-29 08:41:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][560/625] eta 0:00:47 lr 0.000022 wd 0.0500 time 0.5896 (0.7363) data time 0.0011 (0.0114) model time 0.5885 (0.7249) loss 6.4188 (6.6584) grad_norm 3.7507 (2.6273) loss_scale 512.0000 (512.0000) mem 22346MB +[2024-07-29 08:41:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][570/625] eta 0:00:39 lr 0.000022 wd 0.0500 time 0.5885 (0.7196) data time 0.0011 (0.0102) model time 0.5874 (0.7094) loss 7.1845 (6.6277) grad_norm 3.1712 (2.6122) loss_scale 512.0000 (512.0000) mem 22346MB +[2024-07-29 08:41:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][580/625] eta 0:00:31 lr 0.000022 wd 0.0500 time 0.5868 (0.7061) data time 0.0011 (0.0092) model time 0.5857 (0.6969) loss 7.2337 (6.6289) grad_norm 3.3199 (2.6303) loss_scale 512.0000 (512.0000) mem 22346MB +[2024-07-29 08:41:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][590/625] eta 0:00:24 lr 0.000022 wd 0.0500 time 0.5897 (0.6953) data time 0.0008 (0.0085) model time 0.5889 (0.6868) loss 6.4549 (6.6344) grad_norm 1.6450 (2.6855) loss_scale 512.0000 (512.0000) mem 22346MB +[2024-07-29 08:41:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][600/625] eta 0:00:17 lr 0.000022 wd 0.0500 time 0.5892 (0.6863) data time 0.0011 (0.0079) model time 0.5882 (0.6785) loss 7.2114 (6.6144) grad_norm 2.3942 (2.7463) loss_scale 512.0000 (512.0000) mem 22346MB +[2024-07-29 08:41:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][610/625] eta 0:00:10 lr 0.000022 wd 0.0500 time 0.5896 (0.6789) data time 0.0008 (0.0074) model time 0.5888 (0.6715) loss 7.6468 (6.6122) grad_norm 2.2614 (2.8073) loss_scale 512.0000 (512.0000) mem 22346MB +[2024-07-29 08:42:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [283/300][620/625] eta 0:00:03 lr 0.000022 wd 0.0500 time 0.5845 (0.6723) data time 0.0006 (0.0069) model time 0.5839 (0.6655) loss 6.1489 (6.6231) grad_norm 1.8597 (2.7994) loss_scale 512.0000 (512.0000) mem 22346MB +[2024-07-29 08:42:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 283 training takes 0:01:34 +[2024-07-29 08:42:08 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 08:42:14 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 08:42:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.522 (0.522) Loss 0.4883 (0.4883) Acc@1 90.527 (90.527) Acc@5 98.926 (98.926) Mem 22346MB +[2024-07-29 08:42:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.162) Loss 0.7358 (0.5906) Acc@1 83.203 (88.259) Acc@5 97.363 (98.167) Mem 22346MB +[2024-07-29 08:42:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.151) Loss 0.8105 (0.6763) Acc@1 81.299 (85.661) Acc@5 96.143 (97.400) Mem 22346MB +[2024-07-29 08:42:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.277 Acc@5 97.389 +[2024-07-29 08:42:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 08:42:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.807 (0.807) Loss 0.4924 (0.4924) Acc@1 90.527 (90.527) Acc@5 99.023 (99.023) Mem 22346MB +[2024-07-29 08:42:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.188) Loss 0.7329 (0.5949) Acc@1 83.203 (88.268) Acc@5 97.412 (98.167) Mem 22346MB +[2024-07-29 08:42:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.158) Loss 0.8135 (0.6789) Acc@1 81.104 (85.647) Acc@5 96.387 (97.410) Mem 22346MB +[2024-07-29 08:42:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.261 Acc@5 97.403 +[2024-07-29 08:42:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 08:42:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.26% +[2024-07-29 08:42:24 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 08:42:27 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 08:42:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][0/625] eta 0:11:24 lr 0.000022 wd 0.0500 time 1.0948 (1.0948) data time 0.3802 (0.3802) model time 0.0000 (0.0000) loss 6.4586 (6.4586) grad_norm 2.4103 (2.4103) loss_scale 512.0000 (512.0000) mem 22337MB +[2024-07-29 08:42:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][10/625] eta 0:06:31 lr 0.000022 wd 0.0500 time 0.5903 (0.6364) data time 0.0010 (0.0355) model time 0.0000 (0.0000) loss 6.7284 (6.4607) grad_norm 2.6188 (3.0496) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:42:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][20/625] eta 0:06:11 lr 0.000022 wd 0.0500 time 0.5932 (0.6141) data time 0.0011 (0.0191) model time 0.0000 (0.0000) loss 6.9838 (6.6005) grad_norm 1.8129 (2.6874) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:42:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][30/625] eta 0:06:00 lr 0.000021 wd 0.0500 time 0.5900 (0.6063) data time 0.0010 (0.0134) model time 0.0000 (0.0000) loss 6.5432 (6.6535) grad_norm 2.1009 (2.8931) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:42:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][40/625] eta 0:05:52 lr 0.000021 wd 0.0500 time 0.5926 (0.6030) data time 0.0011 (0.0104) model time 0.0000 (0.0000) loss 6.2322 (6.5480) grad_norm 2.4417 (2.8259) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:42:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][50/625] eta 0:05:45 lr 0.000021 wd 0.0500 time 0.5999 (0.6008) data time 0.0010 (0.0085) model time 0.0000 (0.0000) loss 7.2715 (6.5403) grad_norm 3.9322 (2.8087) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:43:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][60/625] eta 0:05:38 lr 0.000021 wd 0.0500 time 0.5933 (0.5993) data time 0.0008 (0.0073) model time 0.5925 (0.5906) loss 6.6638 (6.4855) grad_norm 2.1657 (2.8099) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:43:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][70/625] eta 0:05:32 lr 0.000021 wd 0.0500 time 0.5872 (0.5994) data time 0.0012 (0.0064) model time 0.5860 (0.5949) loss 6.3618 (6.4423) grad_norm 3.1744 (2.8564) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:43:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][80/625] eta 0:05:26 lr 0.000021 wd 0.0500 time 0.5911 (0.5986) data time 0.0010 (0.0058) model time 0.5901 (0.5937) loss 6.7424 (6.4435) grad_norm 2.1896 (2.7952) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:43:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][90/625] eta 0:05:19 lr 0.000021 wd 0.0500 time 0.5931 (0.5979) data time 0.0011 (0.0053) model time 0.5921 (0.5931) loss 7.3039 (6.4684) grad_norm 3.1161 (2.8020) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:43:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][100/625] eta 0:05:13 lr 0.000021 wd 0.0500 time 0.5930 (0.5972) data time 0.0009 (0.0048) model time 0.5921 (0.5925) loss 7.7615 (6.4776) grad_norm 2.5476 (2.7588) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:43:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][110/625] eta 0:05:07 lr 0.000021 wd 0.0500 time 0.5898 (0.5966) data time 0.0008 (0.0045) model time 0.5889 (0.5920) loss 7.2367 (6.4585) grad_norm 2.5125 (2.7339) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:43:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][120/625] eta 0:05:01 lr 0.000021 wd 0.0500 time 0.5908 (0.5960) data time 0.0010 (0.0042) model time 0.5897 (0.5915) loss 6.2288 (6.4442) grad_norm 2.4984 (2.9029) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:43:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][130/625] eta 0:04:54 lr 0.000021 wd 0.0500 time 0.5937 (0.5956) data time 0.0011 (0.0040) model time 0.5926 (0.5912) loss 6.0815 (6.4257) grad_norm 2.8740 (2.8607) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:43:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][140/625] eta 0:04:49 lr 0.000021 wd 0.0500 time 0.5944 (0.5968) data time 0.0012 (0.0038) model time 0.5932 (0.5935) loss 5.9086 (6.4434) grad_norm 2.8017 (2.8216) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:43:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][150/625] eta 0:04:43 lr 0.000021 wd 0.0500 time 0.5899 (0.5964) data time 0.0010 (0.0036) model time 0.5888 (0.5931) loss 6.6804 (6.4373) grad_norm 2.2233 (2.8138) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:44:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][160/625] eta 0:04:37 lr 0.000021 wd 0.0500 time 0.5907 (0.5960) data time 0.0011 (0.0034) model time 0.5896 (0.5926) loss 6.7839 (6.4285) grad_norm 2.5261 (2.9284) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:44:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][170/625] eta 0:04:30 lr 0.000021 wd 0.0500 time 0.5888 (0.5956) data time 0.0011 (0.0033) model time 0.5877 (0.5923) loss 7.2886 (6.4513) grad_norm 2.7846 (2.9046) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:44:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][180/625] eta 0:04:24 lr 0.000021 wd 0.0500 time 0.5947 (0.5952) data time 0.0011 (0.0032) model time 0.5936 (0.5920) loss 7.3648 (6.4947) grad_norm 2.0173 (2.8833) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:44:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][190/625] eta 0:04:18 lr 0.000021 wd 0.0500 time 0.5900 (0.5950) data time 0.0010 (0.0031) model time 0.5890 (0.5918) loss 7.6125 (6.5060) grad_norm 3.4880 (2.8796) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:44:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][200/625] eta 0:04:12 lr 0.000021 wd 0.0500 time 0.5908 (0.5947) data time 0.0009 (0.0030) model time 0.5899 (0.5916) loss 6.2688 (6.5081) grad_norm 1.8115 (2.8816) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:44:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][210/625] eta 0:04:06 lr 0.000021 wd 0.0500 time 0.5877 (0.5944) data time 0.0008 (0.0029) model time 0.5869 (0.5913) loss 7.5542 (6.5115) grad_norm 3.0703 (2.9363) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:44:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][220/625] eta 0:04:00 lr 0.000021 wd 0.0500 time 0.5896 (0.5942) data time 0.0009 (0.0028) model time 0.5888 (0.5911) loss 7.0975 (6.5233) grad_norm 2.6165 (2.9548) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:44:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][230/625] eta 0:03:54 lr 0.000021 wd 0.0500 time 0.5875 (0.5940) data time 0.0011 (0.0027) model time 0.5864 (0.5910) loss 6.5556 (6.5291) grad_norm 2.5106 (2.9767) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:44:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][240/625] eta 0:03:48 lr 0.000021 wd 0.0500 time 0.5887 (0.5937) data time 0.0009 (0.0027) model time 0.5878 (0.5908) loss 6.8968 (6.5226) grad_norm 2.3352 (2.9699) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:44:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][250/625] eta 0:03:42 lr 0.000021 wd 0.0500 time 0.5898 (0.5935) data time 0.0008 (0.0026) model time 0.5890 (0.5906) loss 6.4124 (6.5118) grad_norm 2.5213 (2.9458) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:45:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][260/625] eta 0:03:36 lr 0.000021 wd 0.0500 time 0.5877 (0.5933) data time 0.0008 (0.0025) model time 0.5869 (0.5904) loss 7.4595 (6.5305) grad_norm 2.3324 (2.9565) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:45:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][270/625] eta 0:03:30 lr 0.000021 wd 0.0500 time 0.5888 (0.5931) data time 0.0010 (0.0025) model time 0.5878 (0.5902) loss 6.6732 (6.5344) grad_norm 2.2567 (2.9439) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:45:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][280/625] eta 0:03:24 lr 0.000021 wd 0.0500 time 0.6040 (0.5930) data time 0.0008 (0.0024) model time 0.6032 (0.5903) loss 6.9669 (6.5379) grad_norm 2.2189 (2.9291) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:45:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][290/625] eta 0:03:18 lr 0.000021 wd 0.0500 time 0.5981 (0.5932) data time 0.0008 (0.0024) model time 0.5973 (0.5906) loss 5.8134 (6.5384) grad_norm 3.1948 (2.9178) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:45:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][300/625] eta 0:03:12 lr 0.000021 wd 0.0500 time 0.5886 (0.5932) data time 0.0009 (0.0023) model time 0.5878 (0.5906) loss 5.4509 (6.5360) grad_norm 1.8335 (2.9057) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:45:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][310/625] eta 0:03:06 lr 0.000021 wd 0.0500 time 0.5913 (0.5931) data time 0.0011 (0.0023) model time 0.5902 (0.5905) loss 6.0830 (6.5330) grad_norm 2.0011 (2.8843) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:45:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][320/625] eta 0:03:00 lr 0.000021 wd 0.0500 time 0.5957 (0.5929) data time 0.0008 (0.0022) model time 0.5949 (0.5904) loss 5.6954 (6.5238) grad_norm 2.2639 (2.9074) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:45:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][330/625] eta 0:02:54 lr 0.000021 wd 0.0500 time 0.5953 (0.5928) data time 0.0011 (0.0022) model time 0.5943 (0.5903) loss 6.6665 (6.5181) grad_norm 2.2328 (2.9332) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:45:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][340/625] eta 0:02:48 lr 0.000021 wd 0.0500 time 0.5900 (0.5928) data time 0.0008 (0.0022) model time 0.5892 (0.5903) loss 6.9289 (6.5141) grad_norm 2.5888 (2.9629) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:45:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][350/625] eta 0:02:43 lr 0.000021 wd 0.0500 time 0.5993 (0.5928) data time 0.0010 (0.0021) model time 0.5983 (0.5903) loss 7.4654 (6.5280) grad_norm 4.1086 (2.9736) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:46:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][360/625] eta 0:02:37 lr 0.000021 wd 0.0500 time 0.5938 (0.5933) data time 0.0011 (0.0021) model time 0.5928 (0.5909) loss 7.2266 (6.5221) grad_norm 2.8534 (2.9661) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:46:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][370/625] eta 0:02:31 lr 0.000021 wd 0.0500 time 0.5893 (0.5932) data time 0.0008 (0.0021) model time 0.5885 (0.5909) loss 6.1034 (6.5291) grad_norm 2.0428 (2.9480) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:46:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][380/625] eta 0:02:25 lr 0.000021 wd 0.0500 time 0.5925 (0.5933) data time 0.0008 (0.0021) model time 0.5917 (0.5910) loss 5.4622 (6.5272) grad_norm 2.5520 (2.9396) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:46:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][390/625] eta 0:02:19 lr 0.000021 wd 0.0500 time 0.5918 (0.5932) data time 0.0010 (0.0021) model time 0.5908 (0.5910) loss 7.7630 (6.5295) grad_norm 2.3658 (2.9318) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:46:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][400/625] eta 0:02:13 lr 0.000021 wd 0.0500 time 0.5935 (0.5932) data time 0.0008 (0.0020) model time 0.5927 (0.5910) loss 5.8706 (6.5242) grad_norm 2.1244 (2.9238) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:46:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][410/625] eta 0:02:07 lr 0.000021 wd 0.0500 time 0.5963 (0.5932) data time 0.0008 (0.0020) model time 0.5955 (0.5910) loss 6.0692 (6.5217) grad_norm 2.9314 (2.9161) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:46:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][420/625] eta 0:02:01 lr 0.000021 wd 0.0500 time 0.5921 (0.5931) data time 0.0008 (0.0020) model time 0.5912 (0.5910) loss 7.7445 (6.5342) grad_norm 1.9188 (2.8976) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:46:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][430/625] eta 0:01:55 lr 0.000021 wd 0.0500 time 0.5911 (0.5931) data time 0.0010 (0.0020) model time 0.5900 (0.5909) loss 5.7223 (6.5358) grad_norm 1.9722 (2.8915) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:46:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][440/625] eta 0:01:49 lr 0.000021 wd 0.0500 time 0.5906 (0.5934) data time 0.0009 (0.0020) model time 0.5896 (0.5913) loss 6.1978 (6.5399) grad_norm 5.0363 (2.8839) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:46:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][450/625] eta 0:01:43 lr 0.000021 wd 0.0500 time 0.5991 (0.5934) data time 0.0010 (0.0020) model time 0.5981 (0.5914) loss 7.6986 (6.5499) grad_norm 2.9158 (2.8806) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:47:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][460/625] eta 0:01:37 lr 0.000021 wd 0.0500 time 0.5881 (0.5934) data time 0.0008 (0.0019) model time 0.5872 (0.5913) loss 6.2171 (6.5502) grad_norm 3.3116 (2.8950) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:47:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][470/625] eta 0:01:31 lr 0.000021 wd 0.0500 time 0.6137 (0.5934) data time 0.0011 (0.0019) model time 0.6126 (0.5914) loss 7.0316 (6.5477) grad_norm 2.1920 (2.8888) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:47:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][480/625] eta 0:01:26 lr 0.000021 wd 0.0500 time 0.5977 (0.5934) data time 0.0012 (0.0019) model time 0.5965 (0.5914) loss 6.8363 (6.5525) grad_norm 2.1393 (2.9002) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:47:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][490/625] eta 0:01:20 lr 0.000021 wd 0.0500 time 0.5899 (0.5933) data time 0.0012 (0.0019) model time 0.5888 (0.5913) loss 6.5778 (6.5534) grad_norm 2.3394 (2.9256) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:47:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][500/625] eta 0:01:14 lr 0.000021 wd 0.0500 time 0.6257 (0.5934) data time 0.0011 (0.0019) model time 0.6247 (0.5914) loss 5.6364 (6.5533) grad_norm 2.9379 (2.9203) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:47:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][510/625] eta 0:01:08 lr 0.000021 wd 0.0500 time 0.5915 (0.5937) data time 0.0012 (0.0019) model time 0.5904 (0.5918) loss 5.5797 (6.5494) grad_norm 2.8786 (2.9206) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:47:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][520/625] eta 0:01:02 lr 0.000021 wd 0.0500 time 0.5934 (0.5936) data time 0.0011 (0.0018) model time 0.5924 (0.5918) loss 6.9314 (6.5442) grad_norm 2.4509 (2.9175) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:47:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][530/625] eta 0:00:56 lr 0.000021 wd 0.0500 time 0.6118 (0.5937) data time 0.0010 (0.0018) model time 0.6108 (0.5919) loss 6.0696 (6.5445) grad_norm 1.7912 (2.9157) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:47:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][540/625] eta 0:00:50 lr 0.000021 wd 0.0500 time 0.5887 (0.5937) data time 0.0011 (0.0018) model time 0.5876 (0.5919) loss 7.0932 (6.5457) grad_norm 2.1134 (2.9109) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:47:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][550/625] eta 0:00:44 lr 0.000021 wd 0.0500 time 0.5887 (0.5937) data time 0.0010 (0.0018) model time 0.5877 (0.5919) loss 6.9122 (6.5465) grad_norm 2.6711 (2.9114) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:48:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][560/625] eta 0:00:38 lr 0.000021 wd 0.0500 time 0.6054 (0.5936) data time 0.0008 (0.0018) model time 0.6047 (0.5918) loss 5.8290 (6.5418) grad_norm 2.8473 (2.9028) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:48:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][570/625] eta 0:00:32 lr 0.000020 wd 0.0500 time 0.5888 (0.5936) data time 0.0008 (0.0018) model time 0.5880 (0.5918) loss 5.9286 (6.5414) grad_norm 2.0952 (2.9088) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:48:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][580/625] eta 0:00:26 lr 0.000020 wd 0.0500 time 0.5869 (0.5939) data time 0.0009 (0.0018) model time 0.5860 (0.5921) loss 5.8857 (6.5396) grad_norm 2.4870 (2.9082) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:48:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][590/625] eta 0:00:20 lr 0.000020 wd 0.0500 time 0.5909 (0.5938) data time 0.0010 (0.0017) model time 0.5899 (0.5921) loss 7.2724 (6.5427) grad_norm 3.2552 (2.9088) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:48:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][600/625] eta 0:00:14 lr 0.000020 wd 0.0500 time 0.5879 (0.5938) data time 0.0012 (0.0017) model time 0.5868 (0.5921) loss 6.7319 (6.5499) grad_norm 3.2906 (2.9085) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:48:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][610/625] eta 0:00:08 lr 0.000020 wd 0.0500 time 0.5916 (0.5937) data time 0.0006 (0.0017) model time 0.5910 (0.5920) loss 7.4333 (6.5501) grad_norm 6.9163 (2.9195) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:48:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [284/300][620/625] eta 0:00:02 lr 0.000020 wd 0.0500 time 0.6007 (0.5937) data time 0.0005 (0.0017) model time 0.6001 (0.5919) loss 7.0973 (6.5488) grad_norm 3.7228 (2.9204) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:48:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 284 training takes 0:06:11 +[2024-07-29 08:48:38 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 08:48:40 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 08:48:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.511 (0.511) Loss 0.4912 (0.4912) Acc@1 90.625 (90.625) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-29 08:48:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.127 (0.162) Loss 0.7412 (0.5932) Acc@1 83.057 (88.281) Acc@5 97.363 (98.216) Mem 22339MB +[2024-07-29 08:48:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.145) Loss 0.8105 (0.6780) Acc@1 81.348 (85.731) Acc@5 96.289 (97.447) Mem 22339MB +[2024-07-29 08:48:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.357 Acc@5 97.435 +[2024-07-29 08:48:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.4% +[2024-07-29 08:48:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 85.36% +[2024-07-29 08:48:43 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-29 08:48:47 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-29 08:48:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.577 (0.577) Loss 0.4919 (0.4919) Acc@1 90.479 (90.479) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-29 08:48:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.167) Loss 0.7329 (0.5945) Acc@1 83.203 (88.263) Acc@5 97.412 (98.167) Mem 22339MB +[2024-07-29 08:48:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.148) Loss 0.8125 (0.6785) Acc@1 81.152 (85.640) Acc@5 96.387 (97.417) Mem 22339MB +[2024-07-29 08:48:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.253 Acc@5 97.409 +[2024-07-29 08:48:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 08:48:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][0/625] eta 0:14:53 lr 0.000020 wd 0.0500 time 1.4303 (1.4303) data time 0.5913 (0.5913) model time 0.0000 (0.0000) loss 5.5389 (5.5389) grad_norm 3.0088 (3.0088) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:48:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][10/625] eta 0:06:49 lr 0.000020 wd 0.0500 time 0.5864 (0.6652) data time 0.0010 (0.0547) model time 0.0000 (0.0000) loss 6.1684 (6.6897) grad_norm 3.8092 (3.0920) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:49:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][20/625] eta 0:06:20 lr 0.000020 wd 0.0500 time 0.6008 (0.6296) data time 0.0008 (0.0291) model time 0.0000 (0.0000) loss 6.9318 (6.6714) grad_norm 2.5664 (2.8749) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:49:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][30/625] eta 0:06:07 lr 0.000020 wd 0.0500 time 0.5925 (0.6172) data time 0.0008 (0.0201) model time 0.0000 (0.0000) loss 7.0580 (6.6431) grad_norm 2.5417 (2.8402) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:49:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][40/625] eta 0:05:58 lr 0.000020 wd 0.0500 time 0.5902 (0.6128) data time 0.0008 (0.0154) model time 0.0000 (0.0000) loss 6.1229 (6.6012) grad_norm 2.7976 (2.7994) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:49:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][50/625] eta 0:05:49 lr 0.000020 wd 0.0500 time 0.5903 (0.6083) data time 0.0010 (0.0126) model time 0.0000 (0.0000) loss 6.9464 (6.6000) grad_norm 2.0002 (2.7411) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:49:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][60/625] eta 0:05:42 lr 0.000020 wd 0.0500 time 0.5876 (0.6055) data time 0.0008 (0.0107) model time 0.5869 (0.5899) loss 5.6954 (6.5392) grad_norm 2.5136 (2.8020) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:49:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][70/625] eta 0:05:34 lr 0.000020 wd 0.0500 time 0.5935 (0.6034) data time 0.0010 (0.0093) model time 0.5925 (0.5899) loss 5.2216 (6.5180) grad_norm 2.8758 (2.8880) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:49:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][80/625] eta 0:05:28 lr 0.000020 wd 0.0500 time 0.5979 (0.6019) data time 0.0010 (0.0083) model time 0.5969 (0.5901) loss 6.5335 (6.4780) grad_norm 3.5619 (2.9250) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:49:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][90/625] eta 0:05:21 lr 0.000020 wd 0.0500 time 0.5898 (0.6006) data time 0.0011 (0.0075) model time 0.5887 (0.5897) loss 6.5336 (6.4896) grad_norm 1.9764 (2.9305) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:49:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][100/625] eta 0:05:14 lr 0.000020 wd 0.0500 time 0.5947 (0.5996) data time 0.0008 (0.0068) model time 0.5939 (0.5896) loss 5.9231 (6.5056) grad_norm 2.7864 (2.9199) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:49:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][110/625] eta 0:05:08 lr 0.000020 wd 0.0500 time 0.5911 (0.5988) data time 0.0008 (0.0063) model time 0.5903 (0.5896) loss 5.1534 (6.4877) grad_norm 2.6291 (2.9926) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:50:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][120/625] eta 0:05:02 lr 0.000020 wd 0.0500 time 0.5887 (0.5980) data time 0.0007 (0.0059) model time 0.5880 (0.5895) loss 6.5691 (6.4987) grad_norm 2.4794 (2.9765) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:50:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][130/625] eta 0:04:55 lr 0.000020 wd 0.0500 time 0.5946 (0.5974) data time 0.0008 (0.0055) model time 0.5938 (0.5894) loss 6.4049 (6.5096) grad_norm 2.4591 (2.9902) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:50:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][140/625] eta 0:04:49 lr 0.000020 wd 0.0500 time 0.5892 (0.5969) data time 0.0009 (0.0052) model time 0.5884 (0.5893) loss 5.2615 (6.5334) grad_norm 2.3907 (2.9738) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:50:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][150/625] eta 0:04:43 lr 0.000020 wd 0.0500 time 0.5905 (0.5964) data time 0.0011 (0.0049) model time 0.5893 (0.5893) loss 5.8597 (6.5239) grad_norm 3.7704 (2.9587) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:50:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][160/625] eta 0:04:37 lr 0.000020 wd 0.0500 time 0.5957 (0.5960) data time 0.0011 (0.0047) model time 0.5946 (0.5893) loss 6.8554 (6.5515) grad_norm 2.4194 (2.9154) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:50:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][170/625] eta 0:04:31 lr 0.000020 wd 0.0500 time 0.8017 (0.5978) data time 0.0010 (0.0045) model time 0.8006 (0.5923) loss 8.3769 (6.5473) grad_norm 8.0011 (2.9267) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:50:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][180/625] eta 0:04:25 lr 0.000020 wd 0.0500 time 0.5908 (0.5974) data time 0.0008 (0.0043) model time 0.5900 (0.5921) loss 5.8795 (6.5346) grad_norm 2.1354 (2.9291) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:50:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][190/625] eta 0:04:19 lr 0.000020 wd 0.0500 time 0.5912 (0.5972) data time 0.0010 (0.0041) model time 0.5902 (0.5921) loss 6.1063 (6.5233) grad_norm 2.4241 (2.9236) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:50:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][200/625] eta 0:04:13 lr 0.000020 wd 0.0500 time 0.5907 (0.5969) data time 0.0011 (0.0039) model time 0.5896 (0.5921) loss 6.7387 (6.5240) grad_norm 2.8008 (2.9508) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 08:50:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 08:50:56 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 08:50:57 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 08:55:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 08:55:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 08:55:59 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 08:56:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 08:56:15 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 08:56:15 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 08:56:15 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 08:56:15 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 285) +[2024-07-29 08:56:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 08:56:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][210/625] eta 1:10:02 lr 0.000020 wd 0.0500 time 10.1258 (10.1258) data time 0.8744 (0.8744) model time 9.2514 (9.2514) loss 7.6416 (7.6416) grad_norm 2.0647 (2.0647) loss_scale 512.0000 (512.0000) mem 26016MB +[2024-07-29 08:56:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][220/625] eta 0:10:52 lr 0.000020 wd 0.0500 time 0.6058 (1.6111) data time 0.0010 (0.0806) model time 0.6048 (1.5305) loss 5.8310 (6.9023) grad_norm 2.2702 (2.5597) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-29 08:56:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][230/625] eta 0:07:27 lr 0.000020 wd 0.0500 time 0.6050 (1.1327) data time 0.0010 (0.0427) model time 0.6039 (1.0900) loss 7.0830 (6.7710) grad_norm 2.3293 (2.5815) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-29 08:56:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][240/625] eta 0:06:10 lr 0.000020 wd 0.0500 time 0.6012 (0.9616) data time 0.0009 (0.0293) model time 0.6003 (0.9323) loss 5.3590 (6.7090) grad_norm 1.9724 (2.7750) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-29 08:56:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][250/625] eta 0:05:27 lr 0.000020 wd 0.0500 time 0.6015 (0.8738) data time 0.0011 (0.0224) model time 0.6005 (0.8513) loss 7.0607 (6.6898) grad_norm 1.7698 (2.9360) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-29 08:57:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][260/625] eta 0:05:01 lr 0.000020 wd 0.0500 time 0.8157 (0.8249) data time 0.0009 (0.0182) model time 0.8148 (0.8067) loss 6.9071 (6.6651) grad_norm 2.2082 (2.8648) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-29 08:57:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][270/625] eta 0:04:40 lr 0.000020 wd 0.0500 time 0.6066 (0.7913) data time 0.0011 (0.0154) model time 0.6055 (0.7759) loss 5.8469 (6.6266) grad_norm 7.1096 (2.9553) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-29 08:57:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][280/625] eta 0:04:24 lr 0.000020 wd 0.0500 time 0.6084 (0.7658) data time 0.0010 (0.0134) model time 0.6074 (0.7524) loss 6.6338 (6.6100) grad_norm 2.3223 (2.9049) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-29 08:57:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][290/625] eta 0:04:10 lr 0.000020 wd 0.0500 time 0.6087 (0.7468) data time 0.0010 (0.0119) model time 0.6076 (0.7349) loss 6.8907 (6.6163) grad_norm 5.6297 (2.9570) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-29 08:57:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][300/625] eta 0:03:58 lr 0.000020 wd 0.0500 time 0.6808 (0.7328) data time 0.0009 (0.0107) model time 0.6799 (0.7222) loss 7.0195 (6.6280) grad_norm 2.8109 (2.9766) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-29 08:57:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][310/625] eta 0:03:47 lr 0.000020 wd 0.0500 time 0.6109 (0.7207) data time 0.0008 (0.0097) model time 0.6101 (0.7110) loss 6.9059 (6.6694) grad_norm 10.0753 (3.0568) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-29 08:57:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][320/625] eta 0:03:36 lr 0.000020 wd 0.0500 time 0.6028 (0.7103) data time 0.0010 (0.0089) model time 0.6018 (0.7013) loss 6.3771 (6.6621) grad_norm 3.2417 (3.0236) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-29 08:57:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][330/625] eta 0:03:27 lr 0.000020 wd 0.0500 time 0.6034 (0.7018) data time 0.0008 (0.0083) model time 0.6026 (0.6935) loss 5.5776 (6.6709) grad_norm 3.0628 (3.0095) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-29 08:57:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][340/625] eta 0:03:17 lr 0.000020 wd 0.0500 time 0.6026 (0.6942) data time 0.0011 (0.0077) model time 0.6015 (0.6865) loss 8.0473 (6.6749) grad_norm 2.3666 (2.9934) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-29 08:57:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][350/625] eta 0:03:09 lr 0.000020 wd 0.0500 time 0.6159 (0.6885) data time 0.0010 (0.0073) model time 0.6149 (0.6812) loss 6.6297 (6.6658) grad_norm 1.9543 (2.9689) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-29 08:58:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][360/625] eta 0:03:01 lr 0.000020 wd 0.0500 time 0.6069 (0.6834) data time 0.0010 (0.0069) model time 0.6059 (0.6765) loss 6.9330 (6.6606) grad_norm 2.2496 (2.9389) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-29 08:58:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][370/625] eta 0:02:53 lr 0.000020 wd 0.0500 time 0.6088 (0.6790) data time 0.0010 (0.0065) model time 0.6078 (0.6725) loss 5.9695 (6.6631) grad_norm 2.3614 (2.9212) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-29 08:58:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][380/625] eta 0:02:45 lr 0.000020 wd 0.0500 time 0.6061 (0.6751) data time 0.0010 (0.0062) model time 0.6051 (0.6689) loss 5.7620 (6.6436) grad_norm 2.3108 (2.8798) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-29 08:58:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][390/625] eta 0:02:37 lr 0.000020 wd 0.0500 time 0.6028 (0.6712) data time 0.0010 (0.0059) model time 0.6018 (0.6653) loss 8.0659 (6.6190) grad_norm 3.3567 (2.8835) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-29 08:58:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][400/625] eta 0:02:30 lr 0.000020 wd 0.0500 time 0.6023 (0.6679) data time 0.0010 (0.0057) model time 0.6013 (0.6622) loss 5.2804 (6.6104) grad_norm 2.0216 (2.8925) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-29 08:58:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][410/625] eta 0:02:22 lr 0.000020 wd 0.0500 time 0.6030 (0.6647) data time 0.0010 (0.0054) model time 0.6020 (0.6593) loss 5.9157 (6.6004) grad_norm 2.2360 (2.8737) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-29 08:58:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][420/625] eta 0:02:15 lr 0.000020 wd 0.0500 time 0.6106 (0.6621) data time 0.0010 (0.0052) model time 0.6096 (0.6568) loss 7.1419 (6.6046) grad_norm 2.4083 (2.8679) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-29 08:58:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][430/625] eta 0:02:08 lr 0.000020 wd 0.0500 time 0.6176 (0.6598) data time 0.0008 (0.0050) model time 0.6168 (0.6547) loss 6.4002 (6.6009) grad_norm 2.2257 (2.9064) loss_scale 512.0000 (512.0000) mem 22344MB +[2024-07-29 08:58:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 08:58:51 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 08:58:55 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 09:04:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 09:04:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 09:07:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 09:07:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 09:08:41 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 09:08:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 09:08:51 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 09:08:52 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 09:08:52 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 09:08:52 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 285) +[2024-07-29 09:08:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 09:09:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][440/625] eta 0:25:10 lr 0.000020 wd 0.0500 time 3.2514 (8.1666) data time 0.0009 (0.3911) model time 3.2505 (7.7756) loss 7.8011 (7.3272) grad_norm 2.3803 (2.4725) loss_scale 512.0000 (512.0000) mem 22342MB +[2024-07-29 09:09:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][450/625] eta 0:05:14 lr 0.000020 wd 0.0500 time 0.5209 (1.7955) data time 0.0006 (0.0659) model time 0.5203 (1.7296) loss 6.3356 (6.7690) grad_norm 2.2326 (4.4305) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:09:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][460/625] eta 0:03:20 lr 0.000020 wd 0.0500 time 0.5219 (1.2153) data time 0.0011 (0.0364) model time 0.5208 (1.1789) loss 6.7620 (6.7740) grad_norm 2.4973 (3.6368) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:09:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][470/625] eta 0:02:34 lr 0.000020 wd 0.0500 time 0.5220 (0.9980) data time 0.0007 (0.0253) model time 0.5212 (0.9727) loss 7.0887 (6.8806) grad_norm 2.6837 (3.3565) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:09:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][480/625] eta 0:02:08 lr 0.000020 wd 0.0500 time 0.5192 (0.8840) data time 0.0009 (0.0195) model time 0.5183 (0.8645) loss 7.7015 (6.8242) grad_norm 3.1426 (3.2021) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:09:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][490/625] eta 0:01:50 lr 0.000020 wd 0.0500 time 0.5175 (0.8180) data time 0.0007 (0.0159) model time 0.5168 (0.8020) loss 5.9625 (6.7323) grad_norm 2.4292 (3.1397) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:09:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][500/625] eta 0:01:36 lr 0.000020 wd 0.0500 time 0.5216 (0.7740) data time 0.0006 (0.0135) model time 0.5210 (0.7606) loss 6.7818 (6.7090) grad_norm 2.8319 (3.0458) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:09:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][510/625] eta 0:01:24 lr 0.000020 wd 0.0500 time 0.5178 (0.7385) data time 0.0011 (0.0118) model time 0.5166 (0.7267) loss 6.2994 (6.6650) grad_norm 2.3688 (3.1090) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:09:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][520/625] eta 0:01:14 lr 0.000019 wd 0.0500 time 0.5209 (0.7121) data time 0.0009 (0.0104) model time 0.5200 (0.7017) loss 6.5242 (6.6851) grad_norm 2.1228 (3.0552) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:10:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][530/625] eta 0:01:05 lr 0.000019 wd 0.0500 time 0.5241 (0.6915) data time 0.0007 (0.0094) model time 0.5234 (0.6821) loss 6.2042 (6.6655) grad_norm 2.6011 (3.0296) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:10:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][540/625] eta 0:00:57 lr 0.000019 wd 0.0500 time 0.5170 (0.6756) data time 0.0006 (0.0086) model time 0.5164 (0.6670) loss 7.4116 (6.7003) grad_norm 2.7411 (3.0284) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:10:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][550/625] eta 0:00:49 lr 0.000019 wd 0.0500 time 0.5366 (0.6621) data time 0.0009 (0.0079) model time 0.5357 (0.6542) loss 7.8238 (6.6967) grad_norm 2.5865 (2.9707) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:10:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][560/625] eta 0:00:42 lr 0.000019 wd 0.0500 time 0.5176 (0.6504) data time 0.0007 (0.0073) model time 0.5169 (0.6431) loss 7.6220 (6.6974) grad_norm 2.7543 (3.0335) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:10:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][570/625] eta 0:00:35 lr 0.000019 wd 0.0500 time 0.5224 (0.6409) data time 0.0009 (0.0068) model time 0.5215 (0.6340) loss 5.9454 (6.6722) grad_norm 3.0353 (3.0441) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:10:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][580/625] eta 0:00:28 lr 0.000019 wd 0.0500 time 0.5221 (0.6331) data time 0.0010 (0.0065) model time 0.5211 (0.6266) loss 6.9298 (6.6600) grad_norm 3.1796 (3.0285) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:10:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][590/625] eta 0:00:21 lr 0.000019 wd 0.0500 time 0.5151 (0.6257) data time 0.0014 (0.0061) model time 0.5137 (0.6196) loss 7.3779 (6.6594) grad_norm 2.4111 (3.0047) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:10:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][600/625] eta 0:00:15 lr 0.000019 wd 0.0500 time 0.5228 (0.6191) data time 0.0009 (0.0058) model time 0.5220 (0.6133) loss 7.5428 (6.6629) grad_norm 3.4118 (2.9778) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:10:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][610/625] eta 0:00:09 lr 0.000019 wd 0.0500 time 0.5276 (0.6134) data time 0.0006 (0.0055) model time 0.5270 (0.6078) loss 5.7170 (6.6475) grad_norm 1.9085 (3.0655) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:10:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [285/300][620/625] eta 0:00:03 lr 0.000019 wd 0.0500 time 0.5384 (0.6082) data time 0.0006 (0.0053) model time 0.5378 (0.6029) loss 6.8333 (6.6369) grad_norm 3.5461 (3.0896) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:10:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 285 training takes 0:01:52 +[2024-07-29 09:10:51 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 09:10:54 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 09:10:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.571 (0.571) Loss 0.4863 (0.4863) Acc@1 90.430 (90.430) Acc@5 99.072 (99.072) Mem 22341MB +[2024-07-29 09:10:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.159) Loss 0.7358 (0.5874) Acc@1 83.105 (88.290) Acc@5 97.217 (98.184) Mem 22341MB +[2024-07-29 09:10:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.139) Loss 0.8066 (0.6731) Acc@1 81.445 (85.728) Acc@5 96.338 (97.428) Mem 22341MB +[2024-07-29 09:11:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.377 Acc@5 97.411 +[2024-07-29 09:11:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.4% +[2024-07-29 09:11:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 85.38% +[2024-07-29 09:11:00 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-29 09:11:02 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-29 09:11:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.593 (0.593) Loss 0.4912 (0.4912) Acc@1 90.479 (90.479) Acc@5 99.023 (99.023) Mem 22341MB +[2024-07-29 09:11:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.162) Loss 0.7329 (0.5941) Acc@1 83.203 (88.263) Acc@5 97.412 (98.167) Mem 22341MB +[2024-07-29 09:11:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.140) Loss 0.8120 (0.6781) Acc@1 81.104 (85.631) Acc@5 96.387 (97.419) Mem 22341MB +[2024-07-29 09:11:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.253 Acc@5 97.411 +[2024-07-29 09:11:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 09:11:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.25% +[2024-07-29 09:11:06 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 09:11:11 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 09:11:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][0/625] eta 0:13:28 lr 0.000019 wd 0.0500 time 1.2940 (1.2940) data time 0.5747 (0.5747) model time 0.0000 (0.0000) loss 6.4096 (6.4096) grad_norm 2.6731 (2.6731) loss_scale 512.0000 (512.0000) mem 22337MB +[2024-07-29 09:11:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][10/625] eta 0:06:04 lr 0.000019 wd 0.0500 time 0.5201 (0.5923) data time 0.0007 (0.0531) model time 0.0000 (0.0000) loss 6.9777 (6.2884) grad_norm 2.0634 (2.6058) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:11:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][20/625] eta 0:05:37 lr 0.000019 wd 0.0500 time 0.5186 (0.5574) data time 0.0007 (0.0282) model time 0.0000 (0.0000) loss 6.6359 (6.4317) grad_norm 3.9656 (2.5944) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:11:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][30/625] eta 0:05:25 lr 0.000019 wd 0.0500 time 0.5171 (0.5464) data time 0.0013 (0.0194) model time 0.0000 (0.0000) loss 6.8285 (6.4567) grad_norm 1.8156 (2.5551) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:11:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][40/625] eta 0:05:16 lr 0.000019 wd 0.0500 time 0.5248 (0.5411) data time 0.0009 (0.0149) model time 0.0000 (0.0000) loss 6.8637 (6.4880) grad_norm 5.1981 (2.5959) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:11:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][50/625] eta 0:05:08 lr 0.000019 wd 0.0500 time 0.5180 (0.5366) data time 0.0007 (0.0121) model time 0.0000 (0.0000) loss 6.3690 (6.5088) grad_norm 3.4902 (2.8069) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:11:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][60/625] eta 0:05:01 lr 0.000019 wd 0.0500 time 0.5224 (0.5340) data time 0.0008 (0.0103) model time 0.5216 (0.5195) loss 6.9092 (6.5140) grad_norm 2.0498 (2.7946) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:11:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][70/625] eta 0:04:55 lr 0.000019 wd 0.0500 time 0.5207 (0.5322) data time 0.0009 (0.0090) model time 0.5198 (0.5199) loss 5.6437 (6.4861) grad_norm 19.3529 (3.5648) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:11:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][80/625] eta 0:04:49 lr 0.000019 wd 0.0500 time 0.5170 (0.5306) data time 0.0008 (0.0080) model time 0.5162 (0.5194) loss 5.4856 (6.4513) grad_norm 2.7559 (3.7283) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:12:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][90/625] eta 0:04:44 lr 0.000019 wd 0.0500 time 0.5180 (0.5321) data time 0.0008 (0.0072) model time 0.5172 (0.5254) loss 6.9568 (6.4964) grad_norm 4.8512 (3.6828) loss_scale 1024.0000 (562.6374) mem 22339MB +[2024-07-29 09:12:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][100/625] eta 0:04:38 lr 0.000019 wd 0.0500 time 0.5169 (0.5308) data time 0.0006 (0.0066) model time 0.5162 (0.5240) loss 7.6750 (6.5168) grad_norm 2.0312 (3.6906) loss_scale 1024.0000 (608.3168) mem 22339MB +[2024-07-29 09:12:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][110/625] eta 0:04:32 lr 0.000019 wd 0.0500 time 0.5279 (0.5300) data time 0.0009 (0.0061) model time 0.5270 (0.5233) loss 5.5473 (6.4914) grad_norm 2.4275 (3.6261) loss_scale 1024.0000 (645.7658) mem 22339MB +[2024-07-29 09:12:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][120/625] eta 0:04:27 lr 0.000019 wd 0.0500 time 0.5176 (0.5292) data time 0.0007 (0.0057) model time 0.5169 (0.5229) loss 7.0341 (6.4962) grad_norm 2.8447 (3.5947) loss_scale 1024.0000 (677.0248) mem 22339MB +[2024-07-29 09:12:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][130/625] eta 0:04:21 lr 0.000019 wd 0.0500 time 0.5188 (0.5287) data time 0.0008 (0.0053) model time 0.5180 (0.5227) loss 7.8507 (6.5378) grad_norm 2.6389 (3.5972) loss_scale 1024.0000 (703.5115) mem 22339MB +[2024-07-29 09:12:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][140/625] eta 0:04:16 lr 0.000019 wd 0.0500 time 0.5173 (0.5281) data time 0.0007 (0.0050) model time 0.5167 (0.5224) loss 6.5440 (6.5664) grad_norm 2.8275 (3.5272) loss_scale 1024.0000 (726.2411) mem 22339MB +[2024-07-29 09:12:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][150/625] eta 0:04:10 lr 0.000019 wd 0.0500 time 0.5227 (0.5277) data time 0.0009 (0.0048) model time 0.5218 (0.5221) loss 5.1723 (6.5626) grad_norm 2.7316 (3.4661) loss_scale 1024.0000 (745.9603) mem 22339MB +[2024-07-29 09:12:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][160/625] eta 0:04:05 lr 0.000019 wd 0.0500 time 0.5179 (0.5271) data time 0.0007 (0.0045) model time 0.5172 (0.5217) loss 5.7344 (6.5800) grad_norm 2.2311 (3.4139) loss_scale 1024.0000 (763.2298) mem 22339MB +[2024-07-29 09:12:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][170/625] eta 0:03:59 lr 0.000019 wd 0.0500 time 0.5178 (0.5266) data time 0.0009 (0.0043) model time 0.5170 (0.5214) loss 6.1444 (6.5702) grad_norm 1.9717 (3.3555) loss_scale 1024.0000 (778.4795) mem 22339MB +[2024-07-29 09:12:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][180/625] eta 0:03:54 lr 0.000019 wd 0.0500 time 0.5339 (0.5263) data time 0.0006 (0.0041) model time 0.5333 (0.5212) loss 6.3493 (6.5657) grad_norm 2.3892 (inf) loss_scale 512.0000 (783.5580) mem 22339MB +[2024-07-29 09:12:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][190/625] eta 0:03:48 lr 0.000019 wd 0.0500 time 0.5164 (0.5258) data time 0.0009 (0.0040) model time 0.5155 (0.5209) loss 6.2065 (6.5676) grad_norm 1.8300 (inf) loss_scale 512.0000 (769.3403) mem 22339MB +[2024-07-29 09:12:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][200/625] eta 0:03:43 lr 0.000019 wd 0.0500 time 0.5176 (0.5256) data time 0.0006 (0.0038) model time 0.5169 (0.5208) loss 6.8278 (6.5569) grad_norm 3.4364 (inf) loss_scale 512.0000 (756.5373) mem 22339MB +[2024-07-29 09:13:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][210/625] eta 0:03:38 lr 0.000019 wd 0.0500 time 0.5175 (0.5253) data time 0.0008 (0.0037) model time 0.5167 (0.5208) loss 6.1240 (6.5580) grad_norm 2.6231 (inf) loss_scale 512.0000 (744.9479) mem 22339MB +[2024-07-29 09:13:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][220/625] eta 0:03:32 lr 0.000019 wd 0.0500 time 0.5193 (0.5250) data time 0.0009 (0.0035) model time 0.5184 (0.5206) loss 6.8789 (6.5640) grad_norm 3.5542 (inf) loss_scale 512.0000 (734.4072) mem 22339MB +[2024-07-29 09:13:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][230/625] eta 0:03:27 lr 0.000019 wd 0.0500 time 0.5214 (0.5257) data time 0.0006 (0.0034) model time 0.5207 (0.5216) loss 6.0684 (6.5574) grad_norm 4.3703 (inf) loss_scale 512.0000 (724.7792) mem 22339MB +[2024-07-29 09:13:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][240/625] eta 0:03:22 lr 0.000019 wd 0.0500 time 0.5200 (0.5254) data time 0.0008 (0.0033) model time 0.5192 (0.5214) loss 6.9324 (6.5534) grad_norm 2.4870 (inf) loss_scale 512.0000 (715.9502) mem 22339MB +[2024-07-29 09:13:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][250/625] eta 0:03:16 lr 0.000019 wd 0.0500 time 0.5168 (0.5251) data time 0.0009 (0.0032) model time 0.5159 (0.5212) loss 7.0476 (6.5597) grad_norm 2.2362 (inf) loss_scale 512.0000 (707.8247) mem 22339MB +[2024-07-29 09:13:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][260/625] eta 0:03:11 lr 0.000019 wd 0.0500 time 0.5329 (0.5249) data time 0.0009 (0.0031) model time 0.5320 (0.5211) loss 6.6046 (6.5590) grad_norm 3.2777 (inf) loss_scale 512.0000 (700.3218) mem 22339MB +[2024-07-29 09:13:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][270/625] eta 0:03:06 lr 0.000019 wd 0.0500 time 0.5179 (0.5248) data time 0.0008 (0.0030) model time 0.5170 (0.5211) loss 6.0664 (6.5497) grad_norm 2.8714 (inf) loss_scale 512.0000 (693.3727) mem 22339MB +[2024-07-29 09:13:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][280/625] eta 0:03:00 lr 0.000019 wd 0.0500 time 0.5154 (0.5245) data time 0.0008 (0.0030) model time 0.5146 (0.5209) loss 6.2002 (6.5397) grad_norm 3.7066 (inf) loss_scale 512.0000 (686.9181) mem 22339MB +[2024-07-29 09:13:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][290/625] eta 0:02:55 lr 0.000019 wd 0.0500 time 0.5210 (0.5243) data time 0.0007 (0.0029) model time 0.5203 (0.5208) loss 7.2435 (6.5404) grad_norm 2.2232 (inf) loss_scale 512.0000 (680.9072) mem 22339MB +[2024-07-29 09:13:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][300/625] eta 0:02:50 lr 0.000019 wd 0.0500 time 0.5181 (0.5241) data time 0.0009 (0.0028) model time 0.5172 (0.5207) loss 7.0136 (6.5486) grad_norm 1.9574 (inf) loss_scale 512.0000 (675.2957) mem 22339MB +[2024-07-29 09:13:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][310/625] eta 0:02:45 lr 0.000019 wd 0.0500 time 0.5180 (0.5246) data time 0.0007 (0.0028) model time 0.5173 (0.5213) loss 7.3765 (6.5514) grad_norm 3.2834 (inf) loss_scale 512.0000 (670.0450) mem 22339MB +[2024-07-29 09:14:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][320/625] eta 0:02:40 lr 0.000019 wd 0.0500 time 0.5264 (0.5246) data time 0.0009 (0.0027) model time 0.5255 (0.5214) loss 6.7662 (6.5518) grad_norm 2.7351 (inf) loss_scale 512.0000 (665.1215) mem 22339MB +[2024-07-29 09:14:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][330/625] eta 0:02:34 lr 0.000019 wd 0.0500 time 0.5310 (0.5247) data time 0.0008 (0.0027) model time 0.5301 (0.5215) loss 6.7577 (6.5610) grad_norm 1.8713 (inf) loss_scale 512.0000 (660.4955) mem 22339MB +[2024-07-29 09:14:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][340/625] eta 0:02:29 lr 0.000019 wd 0.0500 time 0.5203 (0.5245) data time 0.0009 (0.0026) model time 0.5194 (0.5215) loss 6.7024 (6.5506) grad_norm 1.9010 (inf) loss_scale 512.0000 (656.1408) mem 22339MB +[2024-07-29 09:14:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][350/625] eta 0:02:24 lr 0.000019 wd 0.0500 time 0.6325 (0.5248) data time 0.0007 (0.0026) model time 0.6318 (0.5218) loss 5.3478 (6.5430) grad_norm 2.8966 (inf) loss_scale 512.0000 (652.0342) mem 22339MB +[2024-07-29 09:14:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][360/625] eta 0:02:19 lr 0.000019 wd 0.0500 time 0.5177 (0.5247) data time 0.0006 (0.0025) model time 0.5170 (0.5218) loss 6.4326 (6.5467) grad_norm 2.0086 (inf) loss_scale 512.0000 (648.1551) mem 22339MB +[2024-07-29 09:14:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][370/625] eta 0:02:13 lr 0.000019 wd 0.0500 time 0.5194 (0.5246) data time 0.0007 (0.0025) model time 0.5187 (0.5217) loss 5.2741 (6.5450) grad_norm 2.8002 (inf) loss_scale 512.0000 (644.4852) mem 22339MB +[2024-07-29 09:14:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][380/625] eta 0:02:08 lr 0.000019 wd 0.0500 time 0.5186 (0.5244) data time 0.0009 (0.0025) model time 0.5177 (0.5216) loss 7.8230 (6.5575) grad_norm 2.0810 (inf) loss_scale 512.0000 (641.0079) mem 22339MB +[2024-07-29 09:14:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][390/625] eta 0:02:03 lr 0.000019 wd 0.0500 time 0.5667 (0.5245) data time 0.0006 (0.0024) model time 0.5661 (0.5218) loss 6.6906 (6.5572) grad_norm 2.2105 (inf) loss_scale 512.0000 (637.7084) mem 22339MB +[2024-07-29 09:14:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][400/625] eta 0:01:58 lr 0.000019 wd 0.0500 time 0.5625 (0.5245) data time 0.0006 (0.0024) model time 0.5619 (0.5218) loss 6.9415 (6.5621) grad_norm 2.6488 (inf) loss_scale 512.0000 (634.5736) mem 22339MB +[2024-07-29 09:14:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][410/625] eta 0:01:52 lr 0.000019 wd 0.0500 time 0.6390 (0.5247) data time 0.0007 (0.0024) model time 0.6383 (0.5220) loss 6.2769 (6.5682) grad_norm 2.3315 (inf) loss_scale 512.0000 (631.5912) mem 22339MB +[2024-07-29 09:14:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][420/625] eta 0:01:47 lr 0.000019 wd 0.0500 time 0.5185 (0.5249) data time 0.0009 (0.0023) model time 0.5176 (0.5224) loss 8.0553 (6.5715) grad_norm 2.4373 (inf) loss_scale 512.0000 (628.7506) mem 22339MB +[2024-07-29 09:14:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][430/625] eta 0:01:42 lr 0.000019 wd 0.0500 time 0.5180 (0.5248) data time 0.0009 (0.0023) model time 0.5171 (0.5223) loss 7.7779 (6.5750) grad_norm 2.5308 (inf) loss_scale 512.0000 (626.0418) mem 22339MB +[2024-07-29 09:15:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][440/625] eta 0:01:37 lr 0.000019 wd 0.0500 time 0.5170 (0.5252) data time 0.0007 (0.0024) model time 0.5163 (0.5226) loss 6.5080 (6.5760) grad_norm 1.9866 (inf) loss_scale 512.0000 (623.4558) mem 22339MB +[2024-07-29 09:15:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][450/625] eta 0:01:32 lr 0.000019 wd 0.0500 time 0.5159 (0.5258) data time 0.0008 (0.0024) model time 0.5151 (0.5233) loss 6.0918 (6.5753) grad_norm 2.5828 (inf) loss_scale 512.0000 (620.9845) mem 22339MB +[2024-07-29 09:15:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][460/625] eta 0:01:26 lr 0.000019 wd 0.0500 time 0.5187 (0.5257) data time 0.0007 (0.0024) model time 0.5180 (0.5232) loss 6.2302 (6.5724) grad_norm 2.3213 (inf) loss_scale 512.0000 (618.6204) mem 22339MB +[2024-07-29 09:15:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][470/625] eta 0:01:21 lr 0.000019 wd 0.0500 time 0.5183 (0.5255) data time 0.0009 (0.0023) model time 0.5174 (0.5230) loss 5.1638 (6.5623) grad_norm 2.1255 (inf) loss_scale 512.0000 (616.3567) mem 22339MB +[2024-07-29 09:15:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][480/625] eta 0:01:16 lr 0.000019 wd 0.0500 time 0.5172 (0.5254) data time 0.0010 (0.0023) model time 0.5162 (0.5230) loss 6.4085 (6.5592) grad_norm 2.4181 (inf) loss_scale 512.0000 (614.1871) mem 22339MB +[2024-07-29 09:15:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][490/625] eta 0:01:10 lr 0.000019 wd 0.0500 time 0.5156 (0.5254) data time 0.0007 (0.0023) model time 0.5149 (0.5229) loss 6.1533 (6.5632) grad_norm 2.2166 (inf) loss_scale 512.0000 (612.1059) mem 22339MB +[2024-07-29 09:15:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][500/625] eta 0:01:05 lr 0.000019 wd 0.0500 time 0.5175 (0.5253) data time 0.0009 (0.0023) model time 0.5166 (0.5229) loss 7.1075 (6.5627) grad_norm 2.3110 (inf) loss_scale 512.0000 (610.1078) mem 22339MB +[2024-07-29 09:15:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][510/625] eta 0:01:00 lr 0.000018 wd 0.0500 time 0.5195 (0.5252) data time 0.0009 (0.0022) model time 0.5186 (0.5228) loss 5.5574 (6.5574) grad_norm 2.4187 (inf) loss_scale 512.0000 (608.1879) mem 22339MB +[2024-07-29 09:15:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][520/625] eta 0:00:55 lr 0.000018 wd 0.0500 time 0.5184 (0.5251) data time 0.0007 (0.0022) model time 0.5177 (0.5227) loss 5.7376 (6.5595) grad_norm 2.4149 (inf) loss_scale 512.0000 (606.3417) mem 22339MB +[2024-07-29 09:15:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][530/625] eta 0:00:49 lr 0.000018 wd 0.0500 time 0.5145 (0.5253) data time 0.0009 (0.0022) model time 0.5136 (0.5230) loss 5.5657 (6.5527) grad_norm 3.2327 (inf) loss_scale 512.0000 (604.5650) mem 22339MB +[2024-07-29 09:15:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][540/625] eta 0:00:44 lr 0.000018 wd 0.0500 time 0.5246 (0.5252) data time 0.0007 (0.0022) model time 0.5239 (0.5229) loss 6.5648 (6.5492) grad_norm 3.1185 (inf) loss_scale 512.0000 (602.8540) mem 22339MB +[2024-07-29 09:16:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][550/625] eta 0:00:39 lr 0.000018 wd 0.0500 time 0.5286 (0.5251) data time 0.0007 (0.0021) model time 0.5280 (0.5228) loss 7.1300 (6.5562) grad_norm 2.1752 (inf) loss_scale 512.0000 (601.2051) mem 22339MB +[2024-07-29 09:16:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][560/625] eta 0:00:34 lr 0.000018 wd 0.0500 time 0.6166 (0.5251) data time 0.0008 (0.0021) model time 0.6159 (0.5228) loss 6.9514 (6.5592) grad_norm 2.2483 (inf) loss_scale 512.0000 (599.6150) mem 22339MB +[2024-07-29 09:16:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][570/625] eta 0:00:28 lr 0.000018 wd 0.0500 time 0.5183 (0.5250) data time 0.0009 (0.0021) model time 0.5175 (0.5227) loss 6.8541 (6.5560) grad_norm 2.7932 (inf) loss_scale 512.0000 (598.0806) mem 22339MB +[2024-07-29 09:16:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][580/625] eta 0:00:23 lr 0.000018 wd 0.0500 time 0.5175 (0.5249) data time 0.0009 (0.0021) model time 0.5166 (0.5227) loss 6.2416 (6.5626) grad_norm 2.8286 (inf) loss_scale 512.0000 (596.5990) mem 22339MB +[2024-07-29 09:16:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][590/625] eta 0:00:18 lr 0.000018 wd 0.0500 time 0.5212 (0.5248) data time 0.0009 (0.0021) model time 0.5202 (0.5226) loss 5.5779 (6.5649) grad_norm 2.6586 (inf) loss_scale 512.0000 (595.1675) mem 22339MB +[2024-07-29 09:16:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][600/625] eta 0:00:13 lr 0.000018 wd 0.0500 time 0.5197 (0.5247) data time 0.0007 (0.0020) model time 0.5190 (0.5225) loss 6.8939 (6.5670) grad_norm 2.4637 (inf) loss_scale 512.0000 (593.7837) mem 22339MB +[2024-07-29 09:16:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][610/625] eta 0:00:07 lr 0.000018 wd 0.0500 time 0.5150 (0.5247) data time 0.0004 (0.0020) model time 0.5146 (0.5225) loss 6.5159 (6.5641) grad_norm 2.0199 (inf) loss_scale 512.0000 (592.4452) mem 22339MB +[2024-07-29 09:16:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [286/300][620/625] eta 0:00:02 lr 0.000018 wd 0.0500 time 0.5187 (0.5245) data time 0.0004 (0.0020) model time 0.5183 (0.5223) loss 5.9368 (6.5561) grad_norm 4.5387 (inf) loss_scale 512.0000 (591.1498) mem 22339MB +[2024-07-29 09:16:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 286 training takes 0:05:27 +[2024-07-29 09:16:39 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 09:16:41 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 09:16:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.574 (0.574) Loss 0.4873 (0.4873) Acc@1 90.771 (90.771) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-29 09:16:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.159) Loss 0.7358 (0.5907) Acc@1 83.105 (88.295) Acc@5 97.217 (98.202) Mem 22339MB +[2024-07-29 09:16:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.139) Loss 0.8086 (0.6752) Acc@1 81.543 (85.756) Acc@5 96.289 (97.433) Mem 22339MB +[2024-07-29 09:16:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.399 Acc@5 97.423 +[2024-07-29 09:16:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.4% +[2024-07-29 09:16:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 85.40% +[2024-07-29 09:16:44 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-29 09:16:50 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-29 09:16:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.588 (0.588) Loss 0.4912 (0.4912) Acc@1 90.479 (90.479) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-29 09:16:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.160) Loss 0.7329 (0.5939) Acc@1 83.203 (88.268) Acc@5 97.412 (98.167) Mem 22339MB +[2024-07-29 09:16:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.139) Loss 0.8120 (0.6780) Acc@1 81.201 (85.642) Acc@5 96.387 (97.414) Mem 22339MB +[2024-07-29 09:16:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.267 Acc@5 97.407 +[2024-07-29 09:16:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 09:16:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.27% +[2024-07-29 09:16:53 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 09:16:56 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 09:16:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][0/625] eta 0:11:00 lr 0.000018 wd 0.0500 time 1.0568 (1.0568) data time 0.5410 (0.5410) model time 0.0000 (0.0000) loss 6.9841 (6.9841) grad_norm 2.0320 (2.0320) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:17:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][10/625] eta 0:05:49 lr 0.000018 wd 0.0500 time 0.5172 (0.5682) data time 0.0007 (0.0500) model time 0.0000 (0.0000) loss 5.7221 (6.7204) grad_norm 3.0416 (2.5131) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:17:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][20/625] eta 0:05:30 lr 0.000018 wd 0.0500 time 0.5433 (0.5463) data time 0.0008 (0.0268) model time 0.0000 (0.0000) loss 5.9647 (6.5047) grad_norm 2.2983 (2.4410) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:17:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][30/625] eta 0:05:19 lr 0.000018 wd 0.0500 time 0.5192 (0.5375) data time 0.0008 (0.0185) model time 0.0000 (0.0000) loss 5.7916 (6.5337) grad_norm 1.8680 (2.5147) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:17:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 09:17:14 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 09:17:16 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 09:20:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 09:20:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 09:20:54 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 09:21:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 09:21:06 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 09:21:06 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 09:21:06 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 09:21:06 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 287) +[2024-07-29 09:21:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 09:21:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][40/625] eta 0:18:42 lr 0.000018 wd 0.0500 time 0.5769 (1.9194) data time 0.0006 (0.0995) model time 0.0000 (0.0000) loss 6.6584 (6.8288) grad_norm 2.7551 (2.8540) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:21:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][50/625] eta 0:11:13 lr 0.000018 wd 0.0500 time 0.5757 (1.1719) data time 0.0006 (0.0447) model time 0.0000 (0.0000) loss 6.5678 (6.7474) grad_norm 14.4505 (3.5748) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:21:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][60/625] eta 0:09:00 lr 0.000018 wd 0.0500 time 0.5722 (0.9572) data time 0.0008 (0.0291) model time 0.5714 (0.5698) loss 7.6783 (6.7726) grad_norm 3.5629 (3.3153) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:21:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][70/625] eta 0:07:55 lr 0.000018 wd 0.0500 time 0.5711 (0.8560) data time 0.0008 (0.0217) model time 0.5702 (0.5708) loss 6.1626 (6.6891) grad_norm 3.3074 (3.2197) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:21:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][80/625] eta 0:07:14 lr 0.000018 wd 0.0500 time 0.5700 (0.7967) data time 0.0006 (0.0174) model time 0.5694 (0.5707) loss 6.4494 (6.6707) grad_norm 1.8187 (3.0465) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:21:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][90/625] eta 0:06:48 lr 0.000018 wd 0.0500 time 0.5760 (0.7635) data time 0.0006 (0.0145) model time 0.5754 (0.5789) loss 5.4433 (6.6142) grad_norm 2.1411 (2.9540) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:22:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][100/625] eta 0:06:26 lr 0.000018 wd 0.0500 time 0.5788 (0.7359) data time 0.0007 (0.0125) model time 0.5781 (0.5780) loss 5.7022 (6.5501) grad_norm 2.7487 (2.9946) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:22:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][110/625] eta 0:06:08 lr 0.000018 wd 0.0500 time 0.5772 (0.7157) data time 0.0006 (0.0110) model time 0.5766 (0.5780) loss 5.8422 (6.5466) grad_norm 3.9975 (2.9706) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:22:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][120/625] eta 0:05:53 lr 0.000018 wd 0.0500 time 0.5751 (0.6998) data time 0.0008 (0.0099) model time 0.5743 (0.5775) loss 6.9627 (6.5480) grad_norm 2.3147 (3.0130) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:22:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][130/625] eta 0:05:40 lr 0.000018 wd 0.0500 time 0.5722 (0.6875) data time 0.0007 (0.0089) model time 0.5716 (0.5776) loss 6.7706 (6.5607) grad_norm 3.0571 (2.9861) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:22:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][140/625] eta 0:05:28 lr 0.000018 wd 0.0500 time 0.5733 (0.6768) data time 0.0006 (0.0082) model time 0.5727 (0.5769) loss 6.0009 (6.5687) grad_norm 5.4558 (2.9717) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:22:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][150/625] eta 0:05:17 lr 0.000018 wd 0.0500 time 0.5722 (0.6681) data time 0.0009 (0.0076) model time 0.5713 (0.5765) loss 6.1564 (6.5613) grad_norm 2.4368 (2.9379) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:22:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][160/625] eta 0:05:07 lr 0.000018 wd 0.0500 time 0.5706 (0.6605) data time 0.0006 (0.0070) model time 0.5700 (0.5760) loss 6.9197 (6.5766) grad_norm 3.2939 (2.9580) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:22:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][170/625] eta 0:04:57 lr 0.000018 wd 0.0500 time 0.5788 (0.6545) data time 0.0008 (0.0066) model time 0.5780 (0.5760) loss 7.1043 (6.5887) grad_norm 2.0654 (3.0016) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:22:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][180/625] eta 0:04:48 lr 0.000018 wd 0.0500 time 0.5797 (0.6493) data time 0.0006 (0.0062) model time 0.5791 (0.5760) loss 7.0534 (6.5829) grad_norm 3.3146 (2.9662) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:22:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][190/625] eta 0:04:40 lr 0.000018 wd 0.0500 time 0.5747 (0.6446) data time 0.0006 (0.0059) model time 0.5741 (0.5760) loss 6.0402 (6.5987) grad_norm 2.7007 (2.9481) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:22:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][200/625] eta 0:04:32 lr 0.000018 wd 0.0500 time 0.5781 (0.6406) data time 0.0009 (0.0056) model time 0.5771 (0.5759) loss 6.8786 (6.6134) grad_norm 2.4336 (2.9439) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:23:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][210/625] eta 0:04:24 lr 0.000018 wd 0.0500 time 0.5817 (0.6368) data time 0.0006 (0.0053) model time 0.5810 (0.5758) loss 5.6885 (6.6017) grad_norm 3.5851 (2.9249) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:23:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][220/625] eta 0:04:16 lr 0.000018 wd 0.0500 time 0.5735 (0.6335) data time 0.0006 (0.0051) model time 0.5729 (0.5756) loss 6.3187 (6.6010) grad_norm 2.3253 (2.9113) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:23:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][230/625] eta 0:04:09 lr 0.000018 wd 0.0500 time 0.5748 (0.6305) data time 0.0008 (0.0048) model time 0.5740 (0.5754) loss 5.4626 (6.5888) grad_norm 1.9290 (2.8976) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:23:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 09:23:18 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 09:23:23 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 09:25:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 09:25:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 09:26:06 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 09:26:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 09:26:17 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 09:26:18 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 09:26:18 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 09:26:18 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 287) +[2024-07-29 09:26:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 09:26:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][240/625] eta 0:18:43 lr 0.000018 wd 0.0500 time 0.5172 (2.9177) data time 0.0007 (0.1844) model time 0.5165 (2.7333) loss 7.4513 (6.8015) grad_norm 2.5480 (2.4537) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:26:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][250/625] eta 0:08:14 lr 0.000018 wd 0.0500 time 0.5166 (1.3182) data time 0.0009 (0.0621) model time 0.5156 (1.2561) loss 6.9183 (6.7515) grad_norm 2.3699 (2.6197) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:26:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][260/625] eta 0:06:04 lr 0.000018 wd 0.0500 time 0.5168 (0.9985) data time 0.0012 (0.0377) model time 0.5156 (0.9608) loss 7.8474 (6.7894) grad_norm 3.2127 (2.6965) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:26:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][270/625] eta 0:05:05 lr 0.000018 wd 0.0500 time 0.5172 (0.8612) data time 0.0010 (0.0272) model time 0.5162 (0.8340) loss 7.0824 (6.7713) grad_norm 2.9587 (2.8134) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:26:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][280/625] eta 0:04:30 lr 0.000018 wd 0.0500 time 0.5168 (0.7849) data time 0.0010 (0.0214) model time 0.5157 (0.7634) loss 7.2245 (6.7268) grad_norm 3.2677 (2.9942) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:27:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][290/625] eta 0:04:09 lr 0.000018 wd 0.0500 time 0.7623 (0.7449) data time 0.0007 (0.0177) model time 0.7616 (0.7272) loss 5.7348 (6.6830) grad_norm 3.9087 (2.9345) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:27:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][300/625] eta 0:03:50 lr 0.000018 wd 0.0500 time 0.5171 (0.7099) data time 0.0009 (0.0151) model time 0.5162 (0.6948) loss 6.7794 (6.6384) grad_norm 2.6102 (2.8640) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:27:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][310/625] eta 0:03:35 lr 0.000018 wd 0.0500 time 0.5172 (0.6845) data time 0.0009 (0.0132) model time 0.5163 (0.6713) loss 5.4092 (6.5982) grad_norm 2.9170 (2.9067) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:27:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][320/625] eta 0:03:22 lr 0.000018 wd 0.0500 time 0.5173 (0.6651) data time 0.0007 (0.0118) model time 0.5166 (0.6533) loss 6.1812 (6.5909) grad_norm 2.1463 (2.8779) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:27:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][330/625] eta 0:03:11 lr 0.000018 wd 0.0500 time 0.5183 (0.6497) data time 0.0011 (0.0107) model time 0.5172 (0.6390) loss 7.2135 (6.5799) grad_norm 3.1005 (2.8591) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:27:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][340/625] eta 0:03:01 lr 0.000018 wd 0.0500 time 0.5268 (0.6373) data time 0.0010 (0.0097) model time 0.5258 (0.6276) loss 6.7857 (6.5912) grad_norm 2.1506 (2.8096) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:27:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][350/625] eta 0:02:52 lr 0.000018 wd 0.0500 time 0.5162 (0.6271) data time 0.0008 (0.0090) model time 0.5154 (0.6181) loss 5.5433 (6.5687) grad_norm 2.5653 (2.8373) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:27:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][360/625] eta 0:02:43 lr 0.000018 wd 0.0500 time 0.5174 (0.6185) data time 0.0007 (0.0083) model time 0.5167 (0.6102) loss 6.4055 (6.5756) grad_norm 3.4868 (2.8868) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:27:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][370/625] eta 0:02:35 lr 0.000018 wd 0.0500 time 0.5163 (0.6113) data time 0.0007 (0.0078) model time 0.5156 (0.6035) loss 5.8608 (6.5870) grad_norm 2.4973 (3.0182) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:27:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][380/625] eta 0:02:28 lr 0.000018 wd 0.0500 time 0.5187 (0.6050) data time 0.0009 (0.0073) model time 0.5178 (0.5977) loss 7.1354 (6.5793) grad_norm 2.6157 (3.0000) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:27:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][390/625] eta 0:02:20 lr 0.000018 wd 0.0500 time 0.5169 (0.5995) data time 0.0010 (0.0069) model time 0.5159 (0.5926) loss 7.3639 (6.5695) grad_norm 2.7174 (2.9867) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:28:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][400/625] eta 0:02:13 lr 0.000018 wd 0.0500 time 0.5195 (0.5947) data time 0.0009 (0.0065) model time 0.5186 (0.5882) loss 8.2695 (6.5701) grad_norm 4.0088 (2.9979) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:28:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][410/625] eta 0:02:07 lr 0.000018 wd 0.0500 time 0.5192 (0.5907) data time 0.0007 (0.0062) model time 0.5185 (0.5845) loss 6.5379 (6.5632) grad_norm 4.5601 (2.9900) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:28:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][420/625] eta 0:02:00 lr 0.000018 wd 0.0500 time 0.5176 (0.5870) data time 0.0010 (0.0059) model time 0.5167 (0.5810) loss 6.1525 (6.5634) grad_norm 5.6060 (2.9911) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:28:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][430/625] eta 0:01:53 lr 0.000018 wd 0.0500 time 0.5162 (0.5835) data time 0.0010 (0.0057) model time 0.5152 (0.5778) loss 6.1744 (6.5573) grad_norm 1.9799 (2.9756) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:28:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][440/625] eta 0:01:47 lr 0.000018 wd 0.0500 time 0.5182 (0.5803) data time 0.0011 (0.0055) model time 0.5171 (0.5749) loss 6.6710 (6.5434) grad_norm 2.9441 (2.9666) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:28:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][450/625] eta 0:01:41 lr 0.000018 wd 0.0500 time 0.5176 (0.5775) data time 0.0009 (0.0053) model time 0.5167 (0.5723) loss 6.6894 (6.5372) grad_norm 4.2200 (2.9591) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:28:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][460/625] eta 0:01:34 lr 0.000018 wd 0.0500 time 0.5170 (0.5750) data time 0.0007 (0.0051) model time 0.5163 (0.5699) loss 6.4647 (6.5375) grad_norm 2.1080 (2.9586) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:28:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][470/625] eta 0:01:28 lr 0.000018 wd 0.0500 time 0.5159 (0.5726) data time 0.0012 (0.0049) model time 0.5147 (0.5677) loss 6.9356 (6.5276) grad_norm 2.5387 (3.0423) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:28:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][480/625] eta 0:01:22 lr 0.000018 wd 0.0500 time 0.5185 (0.5712) data time 0.0009 (0.0048) model time 0.5176 (0.5664) loss 6.5589 (6.5273) grad_norm 2.6109 (3.0452) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:28:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][490/625] eta 0:01:16 lr 0.000018 wd 0.0500 time 0.5210 (0.5703) data time 0.0007 (0.0047) model time 0.5203 (0.5656) loss 5.4767 (6.5138) grad_norm 2.0561 (3.0668) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:28:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][500/625] eta 0:01:11 lr 0.000018 wd 0.0500 time 0.5160 (0.5684) data time 0.0008 (0.0046) model time 0.5152 (0.5638) loss 5.1600 (6.5121) grad_norm 2.4513 (3.0772) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:29:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][510/625] eta 0:01:05 lr 0.000018 wd 0.0500 time 0.5169 (0.5692) data time 0.0009 (0.0045) model time 0.5159 (0.5647) loss 7.3186 (6.5055) grad_norm 2.3536 (3.0600) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:29:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][520/625] eta 0:00:59 lr 0.000018 wd 0.0500 time 0.5168 (0.5674) data time 0.0012 (0.0043) model time 0.5156 (0.5631) loss 7.6788 (6.5089) grad_norm 2.5106 (3.0444) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:29:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][530/625] eta 0:00:53 lr 0.000018 wd 0.0500 time 0.5211 (0.5665) data time 0.0009 (0.0049) model time 0.5202 (0.5616) loss 6.4069 (6.5033) grad_norm 2.0879 (3.0303) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:29:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][540/625] eta 0:00:48 lr 0.000017 wd 0.0500 time 0.5171 (0.5653) data time 0.0011 (0.0048) model time 0.5160 (0.5606) loss 6.0833 (6.5041) grad_norm 1.9407 (3.0278) loss_scale 512.0000 (512.0000) mem 22343MB +[2024-07-29 09:29:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 09:29:20 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 09:29:25 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 09:33:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 09:33:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 09:33:47 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 09:34:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 09:34:00 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 09:34:00 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 09:34:00 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 09:34:00 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 287) +[2024-07-29 09:34:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 09:35:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 09:35:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 09:40:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 09:40:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 09:41:14 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 09:41:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 09:41:23 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 09:41:24 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 09:41:24 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 09:41:24 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 287) +[2024-07-29 09:41:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 09:41:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][550/625] eta 0:05:03 lr 0.000017 wd 0.0500 time 0.5134 (4.0409) data time 0.0008 (0.1704) model time 0.5126 (3.8706) loss 7.5899 (6.8228) grad_norm 2.1846 (2.8155) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:41:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][560/625] eta 0:01:39 lr 0.000017 wd 0.0500 time 0.5184 (1.5317) data time 0.0008 (0.0494) model time 0.5177 (1.4823) loss 6.8056 (6.7404) grad_norm 2.2982 (2.7243) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:41:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][570/625] eta 0:01:01 lr 0.000017 wd 0.0500 time 0.5192 (1.1102) data time 0.0010 (0.0293) model time 0.5181 (1.0809) loss 5.9818 (6.7846) grad_norm 1.9183 (2.6841) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:42:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][580/625] eta 0:00:42 lr 0.000017 wd 0.0500 time 0.5188 (0.9364) data time 0.0009 (0.0210) model time 0.5179 (0.9155) loss 6.1011 (6.7310) grad_norm 2.4074 (2.5862) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:42:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][590/625] eta 0:00:29 lr 0.000017 wd 0.0500 time 0.5201 (0.8418) data time 0.0008 (0.0165) model time 0.5193 (0.8254) loss 5.9651 (6.6959) grad_norm 1.9979 (2.6446) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:42:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][600/625] eta 0:00:19 lr 0.000017 wd 0.0500 time 0.5193 (0.7862) data time 0.0009 (0.0136) model time 0.5183 (0.7726) loss 6.4934 (6.6883) grad_norm 2.7729 (2.5901) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:42:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][610/625] eta 0:00:11 lr 0.000017 wd 0.0500 time 0.5229 (0.7487) data time 0.0005 (0.0119) model time 0.5223 (0.7368) loss 6.5789 (6.6418) grad_norm 2.0712 (2.6015) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:42:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [287/300][620/625] eta 0:00:03 lr 0.000017 wd 0.0500 time 0.5511 (0.7187) data time 0.0005 (0.0104) model time 0.5506 (0.7083) loss 6.7252 (6.6433) grad_norm 2.5800 (2.6023) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 09:42:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 287 training takes 0:00:55 +[2024-07-29 09:42:24 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 09:42:29 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 09:42:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.581 (0.581) Loss 0.4883 (0.4883) Acc@1 90.527 (90.527) Acc@5 99.023 (99.023) Mem 22341MB +[2024-07-29 09:42:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.160) Loss 0.7354 (0.5906) Acc@1 83.350 (88.321) Acc@5 97.314 (98.189) Mem 22341MB +[2024-07-29 09:42:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.139) Loss 0.8145 (0.6752) Acc@1 81.494 (85.735) Acc@5 96.240 (97.428) Mem 22341MB +[2024-07-29 09:42:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.369 Acc@5 97.409 +[2024-07-29 09:42:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.4% +[2024-07-29 09:42:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 1.131 (1.131) Loss 0.4910 (0.4910) Acc@1 90.479 (90.479) Acc@5 99.023 (99.023) Mem 22341MB +[2024-07-29 09:42:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.210) Loss 0.7334 (0.5936) Acc@1 83.203 (88.263) Acc@5 97.412 (98.180) Mem 22341MB +[2024-07-29 09:42:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.165) Loss 0.8115 (0.6777) Acc@1 81.201 (85.649) Acc@5 96.387 (97.424) Mem 22341MB +[2024-07-29 09:42:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.273 Acc@5 97.415 +[2024-07-29 09:42:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 09:42:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.27% +[2024-07-29 09:42:39 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 09:42:42 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 09:42:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][0/625] eta 0:16:40 lr 0.000017 wd 0.0500 time 1.6004 (1.6004) data time 0.4874 (0.4874) model time 0.0000 (0.0000) loss 6.0911 (6.0911) grad_norm 2.6881 (2.6881) loss_scale 512.0000 (512.0000) mem 22337MB +[2024-07-29 09:42:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][10/625] eta 0:06:36 lr 0.000017 wd 0.0500 time 0.5532 (0.6446) data time 0.0007 (0.0455) model time 0.0000 (0.0000) loss 6.7878 (6.5297) grad_norm 1.8434 (2.5911) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:42:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][20/625] eta 0:05:54 lr 0.000017 wd 0.0500 time 0.5200 (0.5868) data time 0.0010 (0.0248) model time 0.0000 (0.0000) loss 7.0805 (6.7376) grad_norm 2.9817 (2.6543) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:42:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][30/625] eta 0:05:40 lr 0.000017 wd 0.0500 time 0.5179 (0.5723) data time 0.0009 (0.0171) model time 0.0000 (0.0000) loss 6.8771 (6.7884) grad_norm 3.5205 (2.6295) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:43:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][40/625] eta 0:05:29 lr 0.000017 wd 0.0500 time 0.5697 (0.5631) data time 0.0010 (0.0133) model time 0.0000 (0.0000) loss 7.8689 (6.7190) grad_norm 2.8059 (2.6859) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:43:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][50/625] eta 0:05:19 lr 0.000017 wd 0.0500 time 0.5188 (0.5558) data time 0.0008 (0.0115) model time 0.0000 (0.0000) loss 6.3683 (6.6289) grad_norm 2.2585 (2.7632) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:43:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][60/625] eta 0:05:10 lr 0.000017 wd 0.0500 time 0.5299 (0.5500) data time 0.0007 (0.0098) model time 0.5292 (0.5198) loss 7.3427 (6.6217) grad_norm 2.0906 (2.7611) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:43:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][70/625] eta 0:05:03 lr 0.000017 wd 0.0500 time 0.5184 (0.5461) data time 0.0008 (0.0086) model time 0.5176 (0.5203) loss 6.2058 (6.5909) grad_norm 2.7838 (2.7560) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:43:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][80/625] eta 0:04:57 lr 0.000017 wd 0.0500 time 0.5188 (0.5457) data time 0.0007 (0.0077) model time 0.5181 (0.5274) loss 7.7794 (6.5950) grad_norm 1.8555 (2.7743) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:43:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][90/625] eta 0:04:50 lr 0.000017 wd 0.0500 time 0.5223 (0.5431) data time 0.0010 (0.0070) model time 0.5214 (0.5259) loss 6.0409 (6.6253) grad_norm 3.3753 (2.7794) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:43:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][100/625] eta 0:04:44 lr 0.000017 wd 0.0500 time 0.5237 (0.5411) data time 0.0007 (0.0064) model time 0.5230 (0.5251) loss 5.8960 (6.5876) grad_norm 2.1490 (2.7712) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:43:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][110/625] eta 0:04:37 lr 0.000017 wd 0.0500 time 0.5188 (0.5394) data time 0.0007 (0.0060) model time 0.5181 (0.5242) loss 7.0048 (6.5887) grad_norm 1.9095 (2.8054) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:43:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][120/625] eta 0:04:31 lr 0.000017 wd 0.0500 time 0.5258 (0.5379) data time 0.0010 (0.0056) model time 0.5247 (0.5237) loss 5.6668 (6.5622) grad_norm 2.3689 (2.7825) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:43:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][130/625] eta 0:04:26 lr 0.000017 wd 0.0500 time 0.5206 (0.5381) data time 0.0008 (0.0052) model time 0.5198 (0.5257) loss 6.2658 (6.5385) grad_norm 2.8389 (2.8313) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:43:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][140/625] eta 0:04:20 lr 0.000017 wd 0.0500 time 0.5280 (0.5371) data time 0.0009 (0.0049) model time 0.5271 (0.5253) loss 6.8057 (6.5221) grad_norm 2.0536 (2.8074) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:44:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][150/625] eta 0:04:14 lr 0.000017 wd 0.0500 time 0.5196 (0.5358) data time 0.0007 (0.0047) model time 0.5189 (0.5245) loss 5.9920 (6.5458) grad_norm 2.9191 (2.7976) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:44:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][160/625] eta 0:04:08 lr 0.000017 wd 0.0500 time 0.5200 (0.5349) data time 0.0008 (0.0044) model time 0.5192 (0.5241) loss 5.0378 (6.5144) grad_norm 2.7365 (2.8021) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:44:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][170/625] eta 0:04:02 lr 0.000017 wd 0.0500 time 0.5226 (0.5340) data time 0.0007 (0.0042) model time 0.5219 (0.5236) loss 6.0338 (6.5039) grad_norm 4.0881 (2.8252) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:44:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][180/625] eta 0:03:57 lr 0.000017 wd 0.0500 time 0.5185 (0.5331) data time 0.0010 (0.0041) model time 0.5175 (0.5231) loss 7.9285 (6.4967) grad_norm 2.6259 (2.8240) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:44:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][190/625] eta 0:03:51 lr 0.000017 wd 0.0500 time 0.5192 (0.5328) data time 0.0008 (0.0039) model time 0.5184 (0.5233) loss 5.5111 (6.4751) grad_norm 5.8051 (2.8410) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:44:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][200/625] eta 0:03:46 lr 0.000017 wd 0.0500 time 0.5191 (0.5332) data time 0.0010 (0.0038) model time 0.5181 (0.5244) loss 7.4757 (6.4846) grad_norm 2.6445 (2.8160) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:44:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][210/625] eta 0:03:41 lr 0.000017 wd 0.0500 time 0.5200 (0.5326) data time 0.0010 (0.0036) model time 0.5191 (0.5241) loss 5.6302 (6.4892) grad_norm 2.0782 (2.8035) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:44:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][220/625] eta 0:03:35 lr 0.000017 wd 0.0500 time 0.5202 (0.5320) data time 0.0010 (0.0035) model time 0.5193 (0.5238) loss 6.6164 (6.4685) grad_norm 2.2113 (2.7892) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:44:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][230/625] eta 0:03:29 lr 0.000017 wd 0.0500 time 0.5197 (0.5315) data time 0.0010 (0.0034) model time 0.5188 (0.5235) loss 7.1368 (6.4717) grad_norm 2.6947 (2.7788) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:44:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][240/625] eta 0:03:24 lr 0.000017 wd 0.0500 time 0.5207 (0.5310) data time 0.0008 (0.0033) model time 0.5200 (0.5233) loss 6.4758 (6.4930) grad_norm 2.0190 (2.7913) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:44:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][250/625] eta 0:03:18 lr 0.000017 wd 0.0500 time 0.5185 (0.5306) data time 0.0010 (0.0032) model time 0.5175 (0.5231) loss 5.1390 (6.4889) grad_norm 2.5655 (2.7864) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:45:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][260/625] eta 0:03:13 lr 0.000017 wd 0.0500 time 0.5199 (0.5303) data time 0.0010 (0.0032) model time 0.5189 (0.5229) loss 6.9204 (6.4923) grad_norm 2.2953 (2.8012) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:45:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][270/625] eta 0:03:08 lr 0.000017 wd 0.0500 time 0.5279 (0.5299) data time 0.0008 (0.0031) model time 0.5271 (0.5228) loss 6.7250 (6.5004) grad_norm 2.0322 (2.8037) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:45:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][280/625] eta 0:03:02 lr 0.000017 wd 0.0500 time 0.5179 (0.5296) data time 0.0007 (0.0030) model time 0.5172 (0.5227) loss 5.8159 (6.4880) grad_norm 2.2129 (2.7989) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:45:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][290/625] eta 0:02:57 lr 0.000017 wd 0.0500 time 0.5194 (0.5292) data time 0.0010 (0.0029) model time 0.5184 (0.5224) loss 7.6573 (6.4866) grad_norm 2.1996 (2.7919) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:45:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][300/625] eta 0:02:51 lr 0.000017 wd 0.0500 time 0.5146 (0.5289) data time 0.0008 (0.0029) model time 0.5138 (0.5223) loss 7.1488 (6.4946) grad_norm 3.3434 (2.7909) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:45:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][310/625] eta 0:02:46 lr 0.000017 wd 0.0500 time 0.5187 (0.5286) data time 0.0008 (0.0028) model time 0.5180 (0.5221) loss 5.9170 (6.4874) grad_norm 6.4286 (2.8074) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:45:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][320/625] eta 0:02:41 lr 0.000017 wd 0.0500 time 0.5187 (0.5283) data time 0.0008 (0.0028) model time 0.5179 (0.5220) loss 6.8109 (6.4947) grad_norm 2.4963 (2.7907) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:45:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][330/625] eta 0:02:35 lr 0.000017 wd 0.0500 time 0.5181 (0.5280) data time 0.0010 (0.0027) model time 0.5172 (0.5218) loss 7.0055 (6.4966) grad_norm 2.5550 (2.7878) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:45:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][340/625] eta 0:02:30 lr 0.000017 wd 0.0500 time 0.5186 (0.5277) data time 0.0008 (0.0027) model time 0.5178 (0.5217) loss 6.0849 (6.4963) grad_norm 2.7986 (2.7832) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:45:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][350/625] eta 0:02:25 lr 0.000017 wd 0.0500 time 0.6913 (0.5281) data time 0.0008 (0.0026) model time 0.6905 (0.5223) loss 7.0032 (6.5060) grad_norm 2.8496 (2.7868) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:45:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][360/625] eta 0:02:19 lr 0.000017 wd 0.0500 time 0.5183 (0.5278) data time 0.0010 (0.0026) model time 0.5173 (0.5221) loss 7.1964 (6.5163) grad_norm 2.0428 (2.7793) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:45:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][370/625] eta 0:02:14 lr 0.000017 wd 0.0500 time 0.5194 (0.5276) data time 0.0010 (0.0025) model time 0.5184 (0.5220) loss 6.7995 (6.5154) grad_norm 2.1402 (2.7789) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 09:46:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][380/625] eta 0:02:09 lr 0.000017 wd 0.0500 time 0.5178 (0.5273) data time 0.0010 (0.0025) model time 0.5169 (0.5218) loss 7.1125 (6.5058) grad_norm 2.9204 (inf) loss_scale 256.0000 (505.2808) mem 22339MB +[2024-07-29 09:46:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][390/625] eta 0:02:03 lr 0.000017 wd 0.0500 time 0.5204 (0.5271) data time 0.0008 (0.0024) model time 0.5196 (0.5217) loss 5.6771 (6.5014) grad_norm 2.1426 (inf) loss_scale 256.0000 (498.9054) mem 22339MB +[2024-07-29 09:46:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][400/625] eta 0:01:58 lr 0.000017 wd 0.0500 time 0.5187 (0.5270) data time 0.0010 (0.0024) model time 0.5177 (0.5217) loss 6.8000 (6.4971) grad_norm 1.9093 (inf) loss_scale 256.0000 (492.8479) mem 22339MB +[2024-07-29 09:46:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][410/625] eta 0:01:53 lr 0.000017 wd 0.0500 time 0.5218 (0.5268) data time 0.0010 (0.0024) model time 0.5209 (0.5217) loss 6.2139 (6.5033) grad_norm 3.3130 (inf) loss_scale 256.0000 (487.0852) mem 22339MB +[2024-07-29 09:46:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][420/625] eta 0:01:48 lr 0.000017 wd 0.0500 time 0.5228 (0.5272) data time 0.0007 (0.0023) model time 0.5221 (0.5221) loss 6.1945 (6.5007) grad_norm 8.1383 (inf) loss_scale 256.0000 (481.5962) mem 22339MB +[2024-07-29 09:46:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][430/625] eta 0:01:42 lr 0.000017 wd 0.0500 time 0.5204 (0.5270) data time 0.0007 (0.0023) model time 0.5196 (0.5220) loss 6.5967 (6.5006) grad_norm 2.2983 (inf) loss_scale 256.0000 (476.3619) mem 22339MB +[2024-07-29 09:46:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][440/625] eta 0:01:37 lr 0.000017 wd 0.0500 time 0.5190 (0.5268) data time 0.0010 (0.0023) model time 0.5180 (0.5219) loss 5.4342 (6.5022) grad_norm 2.8339 (inf) loss_scale 256.0000 (471.3651) mem 22339MB +[2024-07-29 09:46:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][450/625] eta 0:01:32 lr 0.000017 wd 0.0500 time 0.5202 (0.5267) data time 0.0008 (0.0023) model time 0.5195 (0.5219) loss 6.3622 (6.4942) grad_norm 2.5697 (inf) loss_scale 256.0000 (466.5898) mem 22339MB +[2024-07-29 09:46:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][460/625] eta 0:01:26 lr 0.000017 wd 0.0500 time 0.5260 (0.5265) data time 0.0010 (0.0022) model time 0.5251 (0.5218) loss 7.8423 (6.4919) grad_norm 2.2337 (inf) loss_scale 256.0000 (462.0217) mem 22339MB +[2024-07-29 09:46:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][470/625] eta 0:01:21 lr 0.000017 wd 0.0500 time 0.5199 (0.5264) data time 0.0008 (0.0022) model time 0.5191 (0.5217) loss 6.4197 (6.4884) grad_norm 2.2470 (inf) loss_scale 256.0000 (457.6476) mem 22339MB +[2024-07-29 09:46:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][480/625] eta 0:01:16 lr 0.000017 wd 0.0500 time 0.5199 (0.5262) data time 0.0008 (0.0022) model time 0.5191 (0.5217) loss 7.1161 (6.4927) grad_norm 2.4026 (inf) loss_scale 256.0000 (453.4553) mem 22339MB +[2024-07-29 09:47:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][490/625] eta 0:01:11 lr 0.000017 wd 0.0500 time 0.5200 (0.5261) data time 0.0010 (0.0022) model time 0.5191 (0.5216) loss 5.9895 (6.4934) grad_norm 3.3201 (inf) loss_scale 256.0000 (449.4338) mem 22339MB +[2024-07-29 09:47:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][500/625] eta 0:01:05 lr 0.000017 wd 0.0500 time 0.5255 (0.5260) data time 0.0007 (0.0021) model time 0.5248 (0.5215) loss 7.8365 (6.5000) grad_norm 2.4262 (inf) loss_scale 256.0000 (445.5729) mem 22339MB +[2024-07-29 09:47:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][510/625] eta 0:01:00 lr 0.000017 wd 0.0500 time 0.5205 (0.5259) data time 0.0010 (0.0021) model time 0.5195 (0.5215) loss 6.3872 (6.5036) grad_norm 2.3296 (inf) loss_scale 256.0000 (441.8630) mem 22339MB +[2024-07-29 09:47:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][520/625] eta 0:00:55 lr 0.000017 wd 0.0500 time 0.5207 (0.5258) data time 0.0010 (0.0021) model time 0.5197 (0.5214) loss 6.2279 (6.5044) grad_norm 3.0986 (inf) loss_scale 256.0000 (438.2956) mem 22339MB +[2024-07-29 09:47:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][530/625] eta 0:00:49 lr 0.000017 wd 0.0500 time 0.5324 (0.5257) data time 0.0008 (0.0021) model time 0.5317 (0.5214) loss 6.9832 (6.5053) grad_norm 2.1563 (inf) loss_scale 256.0000 (434.8625) mem 22339MB +[2024-07-29 09:47:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][540/625] eta 0:00:44 lr 0.000017 wd 0.0500 time 0.5191 (0.5255) data time 0.0007 (0.0021) model time 0.5183 (0.5213) loss 6.9098 (6.5093) grad_norm 3.4773 (inf) loss_scale 256.0000 (431.5564) mem 22339MB +[2024-07-29 09:47:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][550/625] eta 0:00:39 lr 0.000017 wd 0.0500 time 0.5181 (0.5255) data time 0.0010 (0.0021) model time 0.5171 (0.5213) loss 6.0690 (6.5104) grad_norm 3.1826 (inf) loss_scale 256.0000 (428.3702) mem 22339MB +[2024-07-29 09:47:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][560/625] eta 0:00:34 lr 0.000017 wd 0.0500 time 0.5191 (0.5254) data time 0.0008 (0.0020) model time 0.5183 (0.5212) loss 6.7320 (6.5094) grad_norm 3.9955 (inf) loss_scale 256.0000 (425.2977) mem 22339MB +[2024-07-29 09:47:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][570/625] eta 0:00:28 lr 0.000017 wd 0.0500 time 0.5174 (0.5256) data time 0.0008 (0.0020) model time 0.5166 (0.5216) loss 5.7782 (6.5044) grad_norm 3.7445 (inf) loss_scale 256.0000 (422.3327) mem 22339MB +[2024-07-29 09:47:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][580/625] eta 0:00:23 lr 0.000017 wd 0.0500 time 0.5197 (0.5255) data time 0.0010 (0.0020) model time 0.5188 (0.5215) loss 6.9345 (6.5031) grad_norm 2.8605 (inf) loss_scale 256.0000 (419.4699) mem 22339MB +[2024-07-29 09:47:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][590/625] eta 0:00:18 lr 0.000017 wd 0.0500 time 0.5186 (0.5254) data time 0.0008 (0.0020) model time 0.5178 (0.5215) loss 6.1760 (6.5051) grad_norm 2.7777 (inf) loss_scale 256.0000 (416.7039) mem 22339MB +[2024-07-29 09:47:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][600/625] eta 0:00:13 lr 0.000017 wd 0.0500 time 0.5326 (0.5254) data time 0.0008 (0.0020) model time 0.5319 (0.5215) loss 6.7079 (6.5083) grad_norm 4.8782 (inf) loss_scale 256.0000 (414.0300) mem 22339MB +[2024-07-29 09:48:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][610/625] eta 0:00:07 lr 0.000017 wd 0.0500 time 0.5158 (0.5253) data time 0.0007 (0.0020) model time 0.5151 (0.5215) loss 5.5792 (6.5071) grad_norm 2.2284 (inf) loss_scale 256.0000 (411.4435) mem 22339MB +[2024-07-29 09:48:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [288/300][620/625] eta 0:00:02 lr 0.000017 wd 0.0500 time 0.5189 (0.5252) data time 0.0007 (0.0019) model time 0.5182 (0.5214) loss 5.9823 (6.5069) grad_norm 2.5184 (inf) loss_scale 256.0000 (408.9404) mem 22339MB +[2024-07-29 09:48:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 288 training takes 0:05:28 +[2024-07-29 09:48:10 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 09:48:13 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 09:48:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.570 (0.570) Loss 0.4866 (0.4866) Acc@1 90.625 (90.625) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-29 09:48:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.159) Loss 0.7417 (0.5905) Acc@1 82.812 (88.250) Acc@5 97.217 (98.167) Mem 22339MB +[2024-07-29 09:48:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.139) Loss 0.8105 (0.6754) Acc@1 81.348 (85.749) Acc@5 96.436 (97.456) Mem 22339MB +[2024-07-29 09:48:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.375 Acc@5 97.431 +[2024-07-29 09:48:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.4% +[2024-07-29 09:48:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.955 (0.955) Loss 0.4902 (0.4902) Acc@1 90.479 (90.479) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-29 09:48:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.198) Loss 0.7334 (0.5932) Acc@1 83.252 (88.272) Acc@5 97.363 (98.176) Mem 22339MB +[2024-07-29 09:48:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.159) Loss 0.8110 (0.6773) Acc@1 81.250 (85.656) Acc@5 96.387 (97.421) Mem 22339MB +[2024-07-29 09:48:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.277 Acc@5 97.415 +[2024-07-29 09:48:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 09:48:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.28% +[2024-07-29 09:48:20 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 09:48:23 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 09:48:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][0/625] eta 0:10:36 lr 0.000017 wd 0.0500 time 1.0191 (1.0191) data time 0.5016 (0.5016) model time 0.0000 (0.0000) loss 6.2963 (6.2963) grad_norm 3.8646 (3.8646) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:48:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][10/625] eta 0:05:50 lr 0.000017 wd 0.0500 time 0.5132 (0.5691) data time 0.0008 (0.0465) model time 0.0000 (0.0000) loss 5.2803 (6.4083) grad_norm 3.0199 (2.6025) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:48:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][20/625] eta 0:05:36 lr 0.000016 wd 0.0500 time 0.5135 (0.5563) data time 0.0008 (0.0249) model time 0.0000 (0.0000) loss 6.1829 (6.4376) grad_norm 2.9750 (2.6463) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:48:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][30/625] eta 0:05:24 lr 0.000016 wd 0.0500 time 0.5169 (0.5447) data time 0.0010 (0.0172) model time 0.0000 (0.0000) loss 7.6309 (6.5211) grad_norm 2.1637 (2.6074) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:48:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][40/625] eta 0:05:15 lr 0.000016 wd 0.0500 time 0.5175 (0.5388) data time 0.0008 (0.0133) model time 0.0000 (0.0000) loss 5.9155 (6.6323) grad_norm 2.6336 (2.8691) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:48:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][50/625] eta 0:05:07 lr 0.000016 wd 0.0500 time 0.5159 (0.5353) data time 0.0010 (0.0109) model time 0.0000 (0.0000) loss 5.3620 (6.6024) grad_norm 2.2240 (2.7985) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:48:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][60/625] eta 0:05:01 lr 0.000016 wd 0.0500 time 0.5170 (0.5333) data time 0.0010 (0.0093) model time 0.5160 (0.5220) loss 6.4458 (6.6079) grad_norm 2.6861 (2.7626) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:49:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][70/625] eta 0:04:54 lr 0.000016 wd 0.0500 time 0.5201 (0.5314) data time 0.0010 (0.0081) model time 0.5192 (0.5206) loss 6.4792 (6.6330) grad_norm 2.4819 (2.7663) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:49:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][80/625] eta 0:04:48 lr 0.000016 wd 0.0500 time 0.5168 (0.5302) data time 0.0008 (0.0072) model time 0.5160 (0.5205) loss 5.5980 (6.6213) grad_norm 3.1936 (2.7091) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:49:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][90/625] eta 0:04:44 lr 0.000016 wd 0.0500 time 0.7220 (0.5312) data time 0.0010 (0.0066) model time 0.7210 (0.5250) loss 6.5609 (6.5972) grad_norm 5.4023 (2.7593) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:49:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][100/625] eta 0:04:38 lr 0.000016 wd 0.0500 time 0.5214 (0.5300) data time 0.0010 (0.0060) model time 0.5204 (0.5235) loss 6.9269 (6.5793) grad_norm 2.6464 (2.8201) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:49:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][110/625] eta 0:04:32 lr 0.000016 wd 0.0500 time 0.5165 (0.5291) data time 0.0010 (0.0056) model time 0.5155 (0.5228) loss 7.5553 (6.5610) grad_norm 2.4572 (2.8199) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:49:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][120/625] eta 0:04:26 lr 0.000016 wd 0.0500 time 0.5168 (0.5284) data time 0.0011 (0.0052) model time 0.5158 (0.5224) loss 7.6106 (6.5631) grad_norm 2.4155 (2.7909) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:49:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][130/625] eta 0:04:21 lr 0.000016 wd 0.0500 time 0.5146 (0.5279) data time 0.0007 (0.0049) model time 0.5139 (0.5222) loss 6.0970 (6.5425) grad_norm 2.9753 (2.8407) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:49:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][140/625] eta 0:04:15 lr 0.000016 wd 0.0500 time 0.5159 (0.5273) data time 0.0008 (0.0046) model time 0.5152 (0.5217) loss 5.6446 (6.5370) grad_norm 8.4137 (2.8584) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:49:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][150/625] eta 0:04:10 lr 0.000016 wd 0.0500 time 0.5169 (0.5267) data time 0.0011 (0.0044) model time 0.5159 (0.5213) loss 7.0029 (6.5384) grad_norm 2.7873 (2.8237) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:49:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][160/625] eta 0:04:04 lr 0.000016 wd 0.0500 time 0.5176 (0.5263) data time 0.0010 (0.0041) model time 0.5167 (0.5212) loss 7.6980 (6.5416) grad_norm 2.2012 (2.8067) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:49:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][170/625] eta 0:03:59 lr 0.000016 wd 0.0500 time 0.5188 (0.5260) data time 0.0010 (0.0040) model time 0.5178 (0.5211) loss 7.5660 (6.5560) grad_norm 2.5553 (2.8109) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:49:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][180/625] eta 0:03:53 lr 0.000016 wd 0.0500 time 0.5184 (0.5257) data time 0.0007 (0.0038) model time 0.5176 (0.5209) loss 6.5880 (6.5383) grad_norm 6.3363 (2.8352) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:50:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][190/625] eta 0:03:48 lr 0.000016 wd 0.0500 time 0.5168 (0.5254) data time 0.0010 (0.0037) model time 0.5158 (0.5208) loss 6.7165 (6.5356) grad_norm 1.8807 (2.8276) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:50:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][200/625] eta 0:03:43 lr 0.000016 wd 0.0500 time 0.5202 (0.5251) data time 0.0008 (0.0035) model time 0.5195 (0.5206) loss 6.5388 (6.5410) grad_norm 4.3212 (2.8589) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:50:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][210/625] eta 0:03:37 lr 0.000016 wd 0.0500 time 0.5171 (0.5250) data time 0.0008 (0.0034) model time 0.5163 (0.5207) loss 5.6065 (6.5331) grad_norm 3.2464 (2.8658) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:50:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][220/625] eta 0:03:32 lr 0.000016 wd 0.0500 time 0.5170 (0.5248) data time 0.0008 (0.0033) model time 0.5162 (0.5207) loss 6.6446 (6.5432) grad_norm 2.5808 (2.8555) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:50:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][230/625] eta 0:03:27 lr 0.000016 wd 0.0500 time 0.5204 (0.5247) data time 0.0010 (0.0032) model time 0.5195 (0.5206) loss 6.5530 (6.5523) grad_norm 2.5880 (2.8527) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:50:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][240/625] eta 0:03:22 lr 0.000016 wd 0.0500 time 0.5173 (0.5251) data time 0.0010 (0.0031) model time 0.5163 (0.5214) loss 5.9774 (6.5404) grad_norm 2.1791 (2.8341) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:50:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][250/625] eta 0:03:16 lr 0.000016 wd 0.0500 time 0.5190 (0.5249) data time 0.0008 (0.0030) model time 0.5182 (0.5213) loss 5.4152 (6.5273) grad_norm 2.8027 (2.8630) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:50:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][260/625] eta 0:03:11 lr 0.000016 wd 0.0500 time 0.5169 (0.5247) data time 0.0008 (0.0030) model time 0.5162 (0.5211) loss 5.9835 (6.5344) grad_norm 2.4600 (2.8546) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:50:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][270/625] eta 0:03:06 lr 0.000016 wd 0.0500 time 0.5170 (0.5245) data time 0.0010 (0.0029) model time 0.5160 (0.5210) loss 6.8284 (6.5369) grad_norm 2.6856 (2.8503) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:50:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][280/625] eta 0:03:00 lr 0.000016 wd 0.0500 time 0.5167 (0.5244) data time 0.0009 (0.0028) model time 0.5157 (0.5209) loss 6.8406 (6.5451) grad_norm 3.5149 (2.8534) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:50:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][290/625] eta 0:02:55 lr 0.000016 wd 0.0500 time 0.5203 (0.5242) data time 0.0010 (0.0028) model time 0.5193 (0.5208) loss 6.2755 (6.5440) grad_norm 2.2385 (2.8469) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:51:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][300/625] eta 0:02:50 lr 0.000016 wd 0.0500 time 0.5166 (0.5241) data time 0.0007 (0.0027) model time 0.5159 (0.5208) loss 6.5375 (6.5433) grad_norm 2.7862 (2.9028) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:51:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][310/625] eta 0:02:45 lr 0.000016 wd 0.0500 time 0.5118 (0.5245) data time 0.0009 (0.0027) model time 0.5110 (0.5213) loss 6.9253 (6.5544) grad_norm 2.4599 (2.8912) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:51:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][320/625] eta 0:02:39 lr 0.000016 wd 0.0500 time 0.5165 (0.5244) data time 0.0009 (0.0026) model time 0.5155 (0.5213) loss 5.4513 (6.5423) grad_norm 1.9918 (2.8797) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:51:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][330/625] eta 0:02:34 lr 0.000016 wd 0.0500 time 0.5208 (0.5243) data time 0.0008 (0.0026) model time 0.5201 (0.5212) loss 5.9321 (6.5356) grad_norm 6.4918 (2.8932) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:51:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][340/625] eta 0:02:29 lr 0.000016 wd 0.0500 time 0.5238 (0.5242) data time 0.0009 (0.0025) model time 0.5229 (0.5212) loss 5.8857 (6.5491) grad_norm 2.4007 (2.9014) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:51:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][350/625] eta 0:02:24 lr 0.000016 wd 0.0500 time 0.5130 (0.5241) data time 0.0010 (0.0025) model time 0.5120 (0.5211) loss 6.3348 (6.5494) grad_norm 3.9409 (2.9056) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:51:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][360/625] eta 0:02:18 lr 0.000016 wd 0.0500 time 0.5172 (0.5239) data time 0.0010 (0.0024) model time 0.5162 (0.5210) loss 7.1976 (6.5569) grad_norm 3.4508 (2.9316) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:51:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][370/625] eta 0:02:13 lr 0.000016 wd 0.0500 time 0.5156 (0.5240) data time 0.0010 (0.0024) model time 0.5146 (0.5211) loss 7.4256 (6.5534) grad_norm 2.4352 (2.9225) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:51:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][380/625] eta 0:02:08 lr 0.000016 wd 0.0500 time 0.5139 (0.5239) data time 0.0010 (0.0024) model time 0.5129 (0.5211) loss 5.2463 (6.5550) grad_norm 3.0856 (2.9148) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:51:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][390/625] eta 0:02:03 lr 0.000016 wd 0.0500 time 0.5176 (0.5239) data time 0.0010 (0.0023) model time 0.5166 (0.5211) loss 5.2880 (6.5482) grad_norm 2.1588 (2.9111) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:51:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][400/625] eta 0:01:57 lr 0.000016 wd 0.0500 time 0.5139 (0.5237) data time 0.0009 (0.0023) model time 0.5129 (0.5210) loss 6.9480 (6.5463) grad_norm 2.5316 (2.9117) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:51:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][410/625] eta 0:01:52 lr 0.000016 wd 0.0500 time 0.5221 (0.5237) data time 0.0010 (0.0023) model time 0.5211 (0.5210) loss 7.6302 (6.5458) grad_norm 3.3035 (2.9132) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:52:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][420/625] eta 0:01:47 lr 0.000016 wd 0.0500 time 0.5149 (0.5236) data time 0.0009 (0.0022) model time 0.5140 (0.5209) loss 6.1138 (6.5494) grad_norm 3.2881 (2.9140) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:52:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][430/625] eta 0:01:42 lr 0.000016 wd 0.0500 time 0.5172 (0.5235) data time 0.0008 (0.0022) model time 0.5164 (0.5209) loss 6.8188 (6.5482) grad_norm 2.1805 (2.9013) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:52:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][440/625] eta 0:01:36 lr 0.000016 wd 0.0500 time 0.5134 (0.5235) data time 0.0010 (0.0022) model time 0.5124 (0.5209) loss 7.2792 (6.5485) grad_norm 2.3191 (2.9016) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:52:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][450/625] eta 0:01:31 lr 0.000016 wd 0.0500 time 0.5144 (0.5234) data time 0.0008 (0.0022) model time 0.5137 (0.5209) loss 5.4607 (6.5449) grad_norm 2.4653 (2.8960) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:52:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][460/625] eta 0:01:26 lr 0.000016 wd 0.0500 time 0.5164 (0.5237) data time 0.0007 (0.0021) model time 0.5157 (0.5212) loss 6.0299 (6.5406) grad_norm 1.7415 (2.8874) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:52:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][470/625] eta 0:01:21 lr 0.000016 wd 0.0500 time 0.5169 (0.5236) data time 0.0007 (0.0021) model time 0.5162 (0.5212) loss 8.0708 (6.5471) grad_norm 3.0534 (2.8780) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:52:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][480/625] eta 0:01:15 lr 0.000016 wd 0.0500 time 0.5179 (0.5236) data time 0.0010 (0.0021) model time 0.5170 (0.5211) loss 6.4164 (6.5536) grad_norm 2.0315 (2.8802) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:52:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][490/625] eta 0:01:10 lr 0.000016 wd 0.0500 time 0.5127 (0.5235) data time 0.0010 (0.0021) model time 0.5117 (0.5211) loss 7.0347 (6.5550) grad_norm 2.1379 (2.8735) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:52:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][500/625] eta 0:01:05 lr 0.000016 wd 0.0500 time 0.5150 (0.5235) data time 0.0010 (0.0020) model time 0.5140 (0.5211) loss 6.1940 (6.5541) grad_norm 3.8915 (2.8750) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:52:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][510/625] eta 0:01:00 lr 0.000016 wd 0.0500 time 0.5170 (0.5234) data time 0.0009 (0.0020) model time 0.5161 (0.5210) loss 6.4455 (6.5485) grad_norm 1.8872 (2.8789) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:52:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][520/625] eta 0:00:54 lr 0.000016 wd 0.0500 time 0.5160 (0.5233) data time 0.0010 (0.0020) model time 0.5150 (0.5210) loss 7.6049 (6.5514) grad_norm 2.6248 (2.8767) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:53:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][530/625] eta 0:00:49 lr 0.000016 wd 0.0500 time 0.7060 (0.5237) data time 0.0010 (0.0020) model time 0.7051 (0.5214) loss 6.2619 (6.5453) grad_norm 2.7129 (2.8702) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:53:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][540/625] eta 0:00:44 lr 0.000016 wd 0.0500 time 0.5158 (0.5236) data time 0.0010 (0.0020) model time 0.5148 (0.5213) loss 6.3903 (6.5471) grad_norm 2.8883 (2.8636) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:53:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][550/625] eta 0:00:39 lr 0.000016 wd 0.0500 time 0.5178 (0.5235) data time 0.0009 (0.0020) model time 0.5169 (0.5212) loss 7.2065 (6.5439) grad_norm 2.1862 (2.8546) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:53:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][560/625] eta 0:00:34 lr 0.000016 wd 0.0500 time 0.5187 (0.5234) data time 0.0010 (0.0019) model time 0.5177 (0.5212) loss 6.7237 (6.5475) grad_norm 3.2418 (2.8567) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:53:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][570/625] eta 0:00:28 lr 0.000016 wd 0.0500 time 0.5169 (0.5233) data time 0.0007 (0.0019) model time 0.5162 (0.5211) loss 6.8531 (6.5471) grad_norm 2.6645 (2.8552) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:53:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][580/625] eta 0:00:23 lr 0.000016 wd 0.0500 time 0.5144 (0.5233) data time 0.0010 (0.0019) model time 0.5134 (0.5211) loss 6.3828 (6.5500) grad_norm 2.8421 (2.8493) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:53:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][590/625] eta 0:00:18 lr 0.000016 wd 0.0500 time 0.5181 (0.5232) data time 0.0007 (0.0019) model time 0.5173 (0.5211) loss 6.1356 (6.5540) grad_norm 4.1991 (2.8579) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:53:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][600/625] eta 0:00:13 lr 0.000016 wd 0.0500 time 0.5138 (0.5232) data time 0.0008 (0.0019) model time 0.5130 (0.5210) loss 6.8146 (6.5585) grad_norm 2.9993 (2.8506) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:53:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][610/625] eta 0:00:07 lr 0.000016 wd 0.0500 time 0.5101 (0.5232) data time 0.0005 (0.0019) model time 0.5096 (0.5210) loss 7.2192 (6.5554) grad_norm 4.5737 (2.8521) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:53:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [289/300][620/625] eta 0:00:02 lr 0.000016 wd 0.0500 time 0.5144 (0.5231) data time 0.0005 (0.0019) model time 0.5139 (0.5209) loss 5.7247 (6.5467) grad_norm 5.5460 (2.8528) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:53:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 289 training takes 0:05:26 +[2024-07-29 09:53:50 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 09:53:53 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 09:53:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.570 (0.570) Loss 0.4863 (0.4863) Acc@1 90.479 (90.479) Acc@5 99.072 (99.072) Mem 22339MB +[2024-07-29 09:53:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.159) Loss 0.7354 (0.5891) Acc@1 83.057 (88.277) Acc@5 97.266 (98.176) Mem 22339MB +[2024-07-29 09:53:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.139) Loss 0.8086 (0.6733) Acc@1 81.006 (85.721) Acc@5 96.484 (97.438) Mem 22339MB +[2024-07-29 09:53:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.385 Acc@5 97.419 +[2024-07-29 09:53:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.4% +[2024-07-29 09:53:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 1.015 (1.015) Loss 0.4900 (0.4900) Acc@1 90.479 (90.479) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-29 09:53:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.200) Loss 0.7334 (0.5928) Acc@1 83.301 (88.281) Acc@5 97.363 (98.171) Mem 22339MB +[2024-07-29 09:53:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.160) Loss 0.8105 (0.6769) Acc@1 81.201 (85.663) Acc@5 96.338 (97.421) Mem 22339MB +[2024-07-29 09:54:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.279 Acc@5 97.413 +[2024-07-29 09:54:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 09:54:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.28% +[2024-07-29 09:54:00 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 09:54:03 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 09:54:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][0/625] eta 0:11:03 lr 0.000016 wd 0.0500 time 1.0611 (1.0611) data time 0.5487 (0.5487) model time 0.0000 (0.0000) loss 6.6461 (6.6461) grad_norm 1.8345 (1.8345) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:54:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][10/625] eta 0:05:52 lr 0.000016 wd 0.0500 time 0.5425 (0.5736) data time 0.0007 (0.0509) model time 0.0000 (0.0000) loss 7.2443 (6.7886) grad_norm 2.8001 (2.4842) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:54:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][20/625] eta 0:05:32 lr 0.000016 wd 0.0500 time 0.5184 (0.5491) data time 0.0007 (0.0273) model time 0.0000 (0.0000) loss 6.7203 (6.7427) grad_norm 2.8931 (2.5460) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:54:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][30/625] eta 0:05:21 lr 0.000016 wd 0.0500 time 0.5210 (0.5396) data time 0.0010 (0.0188) model time 0.0000 (0.0000) loss 6.0464 (6.7437) grad_norm 3.0692 (2.6343) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:54:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][40/625] eta 0:05:13 lr 0.000016 wd 0.0500 time 0.5437 (0.5361) data time 0.0010 (0.0145) model time 0.0000 (0.0000) loss 5.8468 (6.7792) grad_norm 1.9413 (3.6458) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:54:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][50/625] eta 0:05:08 lr 0.000016 wd 0.0500 time 0.7418 (0.5372) data time 0.0011 (0.0120) model time 0.0000 (0.0000) loss 7.2752 (6.7582) grad_norm 2.0607 (3.4477) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:54:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][60/625] eta 0:05:03 lr 0.000016 wd 0.0500 time 0.7216 (0.5376) data time 0.0008 (0.0102) model time 0.7208 (0.5384) loss 6.5657 (6.6830) grad_norm 2.7017 (3.3869) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:54:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][70/625] eta 0:04:56 lr 0.000016 wd 0.0500 time 0.5243 (0.5349) data time 0.0007 (0.0089) model time 0.5236 (0.5282) loss 6.0046 (6.6827) grad_norm 1.9280 (3.2864) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:54:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][80/625] eta 0:04:50 lr 0.000016 wd 0.0500 time 0.5184 (0.5329) data time 0.0008 (0.0079) model time 0.5176 (0.5245) loss 6.1903 (6.6927) grad_norm 1.8435 (3.1717) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:54:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][90/625] eta 0:04:44 lr 0.000016 wd 0.0500 time 0.5160 (0.5312) data time 0.0010 (0.0072) model time 0.5150 (0.5225) loss 7.1481 (6.6730) grad_norm 2.1746 (3.0944) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:54:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][100/625] eta 0:04:38 lr 0.000016 wd 0.0500 time 0.5175 (0.5301) data time 0.0007 (0.0065) model time 0.5167 (0.5217) loss 6.6729 (6.6599) grad_norm 3.5858 (3.1490) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:55:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][110/625] eta 0:04:32 lr 0.000016 wd 0.0500 time 0.5288 (0.5291) data time 0.0010 (0.0061) model time 0.5278 (0.5212) loss 5.8480 (6.6638) grad_norm 5.9331 (3.1385) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:55:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][120/625] eta 0:04:26 lr 0.000016 wd 0.0500 time 0.5214 (0.5283) data time 0.0008 (0.0056) model time 0.5207 (0.5208) loss 5.4809 (6.6510) grad_norm 2.7716 (3.1401) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:55:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][130/625] eta 0:04:21 lr 0.000016 wd 0.0500 time 0.5189 (0.5277) data time 0.0010 (0.0053) model time 0.5180 (0.5207) loss 6.0810 (6.6667) grad_norm 2.0999 (3.1138) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:55:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][140/625] eta 0:04:15 lr 0.000016 wd 0.0500 time 0.5258 (0.5272) data time 0.0010 (0.0050) model time 0.5248 (0.5204) loss 6.6657 (6.6761) grad_norm 3.3812 (3.0941) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:55:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][150/625] eta 0:04:10 lr 0.000016 wd 0.0500 time 0.5184 (0.5266) data time 0.0008 (0.0047) model time 0.5175 (0.5202) loss 6.3836 (6.6491) grad_norm 1.9208 (3.1525) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:55:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][160/625] eta 0:04:04 lr 0.000016 wd 0.0500 time 0.5247 (0.5262) data time 0.0007 (0.0045) model time 0.5240 (0.5201) loss 5.6707 (6.6045) grad_norm 2.5626 (3.1088) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:55:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][170/625] eta 0:03:59 lr 0.000016 wd 0.0500 time 0.5241 (0.5258) data time 0.0009 (0.0043) model time 0.5232 (0.5200) loss 6.2832 (6.5842) grad_norm 2.3237 (3.0939) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:55:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][180/625] eta 0:03:53 lr 0.000016 wd 0.0500 time 0.5191 (0.5255) data time 0.0010 (0.0041) model time 0.5181 (0.5198) loss 5.8610 (6.5919) grad_norm 1.9311 (3.0799) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:55:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][190/625] eta 0:03:48 lr 0.000016 wd 0.0500 time 0.5198 (0.5251) data time 0.0010 (0.0040) model time 0.5188 (0.5197) loss 6.8915 (6.5965) grad_norm 2.9018 (3.0504) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:55:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][200/625] eta 0:03:43 lr 0.000015 wd 0.0500 time 0.5225 (0.5249) data time 0.0010 (0.0038) model time 0.5215 (0.5197) loss 6.2527 (6.5970) grad_norm 2.6276 (3.0569) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:55:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][210/625] eta 0:03:37 lr 0.000015 wd 0.0500 time 0.5187 (0.5247) data time 0.0009 (0.0037) model time 0.5178 (0.5196) loss 5.9039 (6.5891) grad_norm 7.7418 (3.0911) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:55:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][220/625] eta 0:03:32 lr 0.000015 wd 0.0500 time 0.5295 (0.5245) data time 0.0009 (0.0036) model time 0.5286 (0.5197) loss 5.2196 (6.5723) grad_norm 2.9741 (3.0777) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:56:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][230/625] eta 0:03:27 lr 0.000015 wd 0.0500 time 0.5201 (0.5243) data time 0.0010 (0.0034) model time 0.5191 (0.5196) loss 6.6557 (6.5736) grad_norm 2.4541 (3.1902) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:56:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][240/625] eta 0:03:21 lr 0.000015 wd 0.0500 time 0.5197 (0.5241) data time 0.0010 (0.0033) model time 0.5187 (0.5195) loss 6.8263 (6.5557) grad_norm 2.2768 (3.1932) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 09:56:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 09:56:13 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 09:56:17 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 10:02:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 10:02:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 10:03:18 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 10:03:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 10:03:36 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 10:03:37 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 10:03:37 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 10:03:37 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 290) +[2024-07-29 10:03:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 10:03:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][250/625] eta 0:27:05 lr 0.000015 wd 0.0500 time 0.5914 (4.3356) data time 0.0008 (0.2551) model time 0.5906 (4.0805) loss 5.8601 (6.6444) grad_norm 2.3467 (2.5083) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:04:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][260/625] eta 0:08:52 lr 0.000015 wd 0.0500 time 0.5938 (1.4585) data time 0.0010 (0.0597) model time 0.5928 (1.3989) loss 6.4583 (6.7391) grad_norm 14.8232 (4.4015) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:04:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][270/625] eta 0:06:24 lr 0.000015 wd 0.0500 time 0.5945 (1.0832) data time 0.0008 (0.0342) model time 0.5937 (1.0490) loss 6.9087 (6.7810) grad_norm 2.7016 (4.0373) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:04:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][280/625] eta 0:05:22 lr 0.000015 wd 0.0500 time 0.5934 (0.9344) data time 0.0009 (0.0241) model time 0.5925 (0.9102) loss 7.2246 (6.7339) grad_norm 3.2240 (3.9823) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:04:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][290/625] eta 0:04:46 lr 0.000015 wd 0.0500 time 0.5886 (0.8550) data time 0.0010 (0.0188) model time 0.5876 (0.8362) loss 6.8681 (6.6386) grad_norm 1.9326 (3.6461) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:04:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][300/625] eta 0:04:22 lr 0.000015 wd 0.0500 time 0.5976 (0.8086) data time 0.0011 (0.0154) model time 0.5965 (0.7932) loss 6.3548 (6.5956) grad_norm 1.8211 (3.5685) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:04:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][310/625] eta 0:04:05 lr 0.000015 wd 0.0500 time 0.5927 (0.7792) data time 0.0011 (0.0132) model time 0.5916 (0.7661) loss 5.8486 (6.5872) grad_norm 2.1762 (3.6132) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:04:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][320/625] eta 0:03:50 lr 0.000015 wd 0.0500 time 0.5974 (0.7546) data time 0.0011 (0.0115) model time 0.5963 (0.7431) loss 7.4871 (6.5697) grad_norm 6.3674 (3.5072) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:04:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][330/625] eta 0:03:37 lr 0.000015 wd 0.0500 time 0.6010 (0.7364) data time 0.0008 (0.0103) model time 0.6002 (0.7261) loss 5.7614 (6.5714) grad_norm 3.5342 (3.3999) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:04:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][340/625] eta 0:03:25 lr 0.000015 wd 0.0500 time 0.6019 (0.7217) data time 0.0008 (0.0093) model time 0.6011 (0.7125) loss 8.0466 (6.5624) grad_norm 2.2647 (3.3387) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:04:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][350/625] eta 0:03:15 lr 0.000015 wd 0.0500 time 0.5922 (0.7094) data time 0.0008 (0.0085) model time 0.5913 (0.7009) loss 7.1794 (6.5925) grad_norm 2.9215 (3.2727) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:05:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][360/625] eta 0:03:05 lr 0.000015 wd 0.0500 time 0.5927 (0.6995) data time 0.0010 (0.0079) model time 0.5916 (0.6916) loss 5.7496 (6.5710) grad_norm 3.3540 (3.1825) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:05:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][370/625] eta 0:02:56 lr 0.000015 wd 0.0500 time 0.5985 (0.6909) data time 0.0011 (0.0073) model time 0.5975 (0.6836) loss 6.6417 (6.5858) grad_norm 3.2558 (3.1737) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:05:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][380/625] eta 0:02:47 lr 0.000015 wd 0.0500 time 0.5996 (0.6840) data time 0.0010 (0.0068) model time 0.5986 (0.6771) loss 7.1189 (6.5813) grad_norm 2.5592 (3.1239) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:05:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][390/625] eta 0:02:39 lr 0.000015 wd 0.0500 time 0.6017 (0.6782) data time 0.0011 (0.0064) model time 0.6006 (0.6718) loss 6.7131 (6.5813) grad_norm 2.6532 (3.0991) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:05:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][400/625] eta 0:02:31 lr 0.000015 wd 0.0500 time 0.5990 (0.6733) data time 0.0008 (0.0061) model time 0.5982 (0.6672) loss 7.0920 (6.5727) grad_norm 2.4672 (3.0842) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:05:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][410/625] eta 0:02:23 lr 0.000015 wd 0.0500 time 0.5985 (0.6689) data time 0.0007 (0.0058) model time 0.5977 (0.6631) loss 5.6408 (6.5717) grad_norm 1.9110 (3.0789) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:05:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][420/625] eta 0:02:16 lr 0.000015 wd 0.0500 time 0.5932 (0.6648) data time 0.0010 (0.0055) model time 0.5922 (0.6593) loss 7.7767 (6.5823) grad_norm 3.8814 (3.0717) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:05:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][430/625] eta 0:02:08 lr 0.000015 wd 0.0500 time 0.5950 (0.6611) data time 0.0010 (0.0053) model time 0.5940 (0.6559) loss 7.1627 (6.5794) grad_norm 3.6106 (3.1116) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:05:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][440/625] eta 0:02:01 lr 0.000015 wd 0.0500 time 0.5965 (0.6579) data time 0.0008 (0.0050) model time 0.5957 (0.6528) loss 6.5060 (6.5645) grad_norm 2.4032 (3.0817) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:05:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][450/625] eta 0:01:54 lr 0.000015 wd 0.0500 time 0.5998 (0.6549) data time 0.0008 (0.0048) model time 0.5990 (0.6501) loss 5.4382 (6.5361) grad_norm 2.8787 (3.0577) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:06:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][460/625] eta 0:01:47 lr 0.000015 wd 0.0500 time 0.6021 (0.6525) data time 0.0010 (0.0047) model time 0.6011 (0.6479) loss 5.8029 (6.5367) grad_norm 2.8431 (3.0379) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:06:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][470/625] eta 0:01:40 lr 0.000015 wd 0.0500 time 0.6027 (0.6499) data time 0.0007 (0.0045) model time 0.6020 (0.6455) loss 5.9791 (6.5434) grad_norm 2.1558 (3.0669) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:06:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][480/625] eta 0:01:33 lr 0.000015 wd 0.0500 time 0.6027 (0.6480) data time 0.0010 (0.0043) model time 0.6017 (0.6436) loss 5.6505 (6.5399) grad_norm 2.7917 (3.0745) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:06:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][490/625] eta 0:01:27 lr 0.000015 wd 0.0500 time 0.5974 (0.6460) data time 0.0010 (0.0042) model time 0.5964 (0.6418) loss 7.6434 (6.5509) grad_norm 2.1078 (3.0663) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:06:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][500/625] eta 0:01:20 lr 0.000015 wd 0.0500 time 0.5992 (0.6440) data time 0.0010 (0.0041) model time 0.5981 (0.6400) loss 7.1936 (6.5490) grad_norm 2.5490 (3.0434) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:06:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][510/625] eta 0:01:13 lr 0.000015 wd 0.0500 time 0.5919 (0.6423) data time 0.0008 (0.0040) model time 0.5910 (0.6383) loss 6.3854 (6.5340) grad_norm 3.4227 (3.0441) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:06:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][520/625] eta 0:01:07 lr 0.000015 wd 0.0500 time 0.5950 (0.6411) data time 0.0009 (0.0038) model time 0.5941 (0.6373) loss 7.5297 (6.5270) grad_norm 2.3852 (3.0633) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:06:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][530/625] eta 0:01:00 lr 0.000015 wd 0.0500 time 0.6038 (0.6405) data time 0.0010 (0.0038) model time 0.6028 (0.6367) loss 7.7499 (6.5347) grad_norm 2.1264 (3.1166) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:06:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 10:06:49 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 10:06:52 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 10:09:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 10:09:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 10:09:51 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 10:10:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 10:10:19 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 10:10:20 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 10:10:20 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 10:10:20 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 290) +[2024-07-29 10:10:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 10:10:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][540/625] eta 0:15:26 lr 0.000015 wd 0.0500 time 10.9022 (10.9022) data time 1.1312 (1.1312) model time 9.7710 (9.7710) loss 6.7991 (6.7991) grad_norm 2.1661 (2.1661) loss_scale 256.0000 (256.0000) mem 26016MB +[2024-07-29 10:10:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][550/625] eta 0:02:02 lr 0.000015 wd 0.0500 time 0.5177 (1.6359) data time 0.0009 (0.1037) model time 0.5167 (1.5321) loss 6.3788 (6.7777) grad_norm 3.2490 (3.0157) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:10:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][560/625] eta 0:01:11 lr 0.000015 wd 0.0500 time 0.5180 (1.1077) data time 0.0009 (0.0554) model time 0.5171 (1.0523) loss 5.8394 (6.6466) grad_norm 2.6741 (2.8115) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:10:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][570/625] eta 0:00:50 lr 0.000015 wd 0.0500 time 0.5172 (0.9207) data time 0.0007 (0.0378) model time 0.5165 (0.8829) loss 5.7451 (6.6853) grad_norm 2.0073 (2.6862) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:11:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][580/625] eta 0:00:37 lr 0.000015 wd 0.0500 time 0.5164 (0.8230) data time 0.0010 (0.0289) model time 0.5154 (0.7941) loss 7.2994 (6.6522) grad_norm 2.9780 (2.8163) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:11:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][590/625] eta 0:00:26 lr 0.000015 wd 0.0500 time 0.7374 (0.7698) data time 0.0008 (0.0234) model time 0.7365 (0.7464) loss 6.7159 (6.6041) grad_norm 2.5345 (2.7653) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:11:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][600/625] eta 0:00:18 lr 0.000015 wd 0.0500 time 0.5161 (0.7350) data time 0.0011 (0.0198) model time 0.5150 (0.7152) loss 6.5458 (6.5778) grad_norm 3.1186 (2.8284) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:11:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][610/625] eta 0:00:10 lr 0.000015 wd 0.0500 time 0.5153 (0.7086) data time 0.0007 (0.0175) model time 0.5146 (0.6911) loss 5.5756 (6.5204) grad_norm 1.8037 (2.8083) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:11:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [290/300][620/625] eta 0:00:03 lr 0.000015 wd 0.0500 time 0.5150 (0.6857) data time 0.0007 (0.0154) model time 0.5143 (0.6703) loss 6.1236 (6.5163) grad_norm 2.1661 (2.8412) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:11:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 290 training takes 0:00:57 +[2024-07-29 10:11:25 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 10:11:29 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 10:11:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.592 (0.592) Loss 0.4880 (0.4880) Acc@1 90.430 (90.430) Acc@5 98.975 (98.975) Mem 22344MB +[2024-07-29 10:11:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.161) Loss 0.7412 (0.5901) Acc@1 83.057 (88.250) Acc@5 97.217 (98.193) Mem 22344MB +[2024-07-29 10:11:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.140) Loss 0.8052 (0.6753) Acc@1 81.494 (85.705) Acc@5 96.240 (97.435) Mem 22344MB +[2024-07-29 10:11:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.367 Acc@5 97.423 +[2024-07-29 10:11:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.4% +[2024-07-29 10:11:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 1.045 (1.045) Loss 0.4900 (0.4900) Acc@1 90.381 (90.381) Acc@5 99.023 (99.023) Mem 22344MB +[2024-07-29 10:11:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.205) Loss 0.7329 (0.5925) Acc@1 83.301 (88.286) Acc@5 97.363 (98.171) Mem 22344MB +[2024-07-29 10:11:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.163) Loss 0.8101 (0.6767) Acc@1 81.201 (85.658) Acc@5 96.289 (97.419) Mem 22344MB +[2024-07-29 10:11:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.279 Acc@5 97.411 +[2024-07-29 10:11:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 10:11:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.28% +[2024-07-29 10:11:39 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 10:11:41 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 10:11:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][0/625] eta 0:17:48 lr 0.000015 wd 0.0500 time 1.7091 (1.7091) data time 0.5342 (0.5342) model time 0.0000 (0.0000) loss 6.3529 (6.3529) grad_norm 3.1537 (3.1537) loss_scale 256.0000 (256.0000) mem 22337MB +[2024-07-29 10:11:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][10/625] eta 0:06:33 lr 0.000015 wd 0.0500 time 0.5156 (0.6391) data time 0.0010 (0.0495) model time 0.0000 (0.0000) loss 6.6042 (6.6704) grad_norm 2.4219 (2.8615) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 10:11:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][20/625] eta 0:05:53 lr 0.000015 wd 0.0500 time 0.5176 (0.5851) data time 0.0010 (0.0278) model time 0.0000 (0.0000) loss 6.8862 (6.6806) grad_norm 1.9006 (3.2052) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 10:11:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][30/625] eta 0:05:36 lr 0.000015 wd 0.0500 time 0.5393 (0.5647) data time 0.0008 (0.0191) model time 0.0000 (0.0000) loss 6.5348 (6.5708) grad_norm 2.9626 (3.0678) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 10:12:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][40/625] eta 0:05:24 lr 0.000015 wd 0.0500 time 0.5159 (0.5554) data time 0.0008 (0.0147) model time 0.0000 (0.0000) loss 5.4654 (6.5070) grad_norm 3.0016 (2.9556) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 10:12:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][50/625] eta 0:05:16 lr 0.000015 wd 0.0500 time 0.5543 (0.5501) data time 0.0011 (0.0121) model time 0.0000 (0.0000) loss 5.8137 (6.5363) grad_norm 2.3653 (2.8895) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 10:12:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][60/625] eta 0:05:08 lr 0.000015 wd 0.0500 time 0.5163 (0.5462) data time 0.0008 (0.0102) model time 0.5155 (0.5256) loss 5.4686 (6.4966) grad_norm 2.0194 (2.8460) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 10:12:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][70/625] eta 0:05:01 lr 0.000015 wd 0.0500 time 0.5754 (0.5437) data time 0.0010 (0.0089) model time 0.5744 (0.5265) loss 6.5889 (6.5081) grad_norm 3.4185 (2.8369) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 10:12:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][80/625] eta 0:04:55 lr 0.000015 wd 0.0500 time 0.6009 (0.5424) data time 0.0009 (0.0080) model time 0.5999 (0.5282) loss 6.8960 (6.5326) grad_norm 3.9518 (2.9475) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 10:12:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 10:12:28 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 10:12:29 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 10:17:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 10:17:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 10:18:03 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 10:19:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 10:19:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 10:20:12 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 10:20:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 10:20:26 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 10:20:27 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 10:20:27 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 10:20:27 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 291) +[2024-07-29 10:20:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 10:20:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][90/625] eta 0:25:50 lr 0.000015 wd 0.0500 time 0.5639 (2.8981) data time 0.0007 (0.2068) model time 0.5633 (2.6913) loss 7.0701 (6.6221) grad_norm 2.1516 (2.6090) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:20:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][100/625] eta 0:10:46 lr 0.000015 wd 0.0500 time 0.5654 (1.2314) data time 0.0007 (0.0598) model time 0.5647 (1.1716) loss 6.1376 (6.5682) grad_norm 3.1704 (2.6532) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:20:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][110/625] eta 0:08:11 lr 0.000015 wd 0.0500 time 0.5652 (0.9536) data time 0.0008 (0.0353) model time 0.5644 (0.9183) loss 6.9631 (6.6871) grad_norm 2.4907 (2.6851) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:20:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][120/625] eta 0:07:03 lr 0.000015 wd 0.0500 time 0.5677 (0.8395) data time 0.0006 (0.0251) model time 0.5671 (0.8144) loss 6.0479 (6.6890) grad_norm 3.2535 (2.8070) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:21:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][130/625] eta 0:06:25 lr 0.000015 wd 0.0500 time 0.5675 (0.7778) data time 0.0007 (0.0197) model time 0.5669 (0.7582) loss 6.3482 (6.6752) grad_norm 1.8762 (2.8536) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:21:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 10:21:07 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 10:21:12 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 10:25:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 10:25:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 10:26:04 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 10:26:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 10:26:17 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 10:26:17 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 10:26:17 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 10:26:17 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 291) +[2024-07-29 10:26:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 10:26:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][140/625] eta 0:15:24 lr 0.000015 wd 0.0500 time 0.5751 (1.9055) data time 0.0010 (0.1062) model time 0.5741 (1.7993) loss 6.9907 (6.7843) grad_norm 2.6523 (2.5396) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:26:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][150/625] eta 0:08:53 lr 0.000015 wd 0.0500 time 0.5736 (1.1225) data time 0.0008 (0.0442) model time 0.5727 (1.0782) loss 6.3657 (6.6704) grad_norm 3.5857 (2.7728) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:26:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][160/625] eta 0:07:07 lr 0.000015 wd 0.0500 time 0.5711 (0.9191) data time 0.0007 (0.0282) model time 0.5705 (0.8910) loss 6.2379 (6.5754) grad_norm 1.9695 (2.6054) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:26:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][170/625] eta 0:06:15 lr 0.000015 wd 0.0500 time 0.5714 (0.8256) data time 0.0009 (0.0208) model time 0.5705 (0.8048) loss 6.4773 (6.6064) grad_norm 2.1355 (2.6394) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:26:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][180/625] eta 0:05:43 lr 0.000015 wd 0.0500 time 0.5724 (0.7720) data time 0.0006 (0.0165) model time 0.5718 (0.7555) loss 6.2663 (6.5488) grad_norm 3.4013 (2.7461) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:27:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][190/625] eta 0:05:23 lr 0.000015 wd 0.0500 time 0.5745 (0.7434) data time 0.0009 (0.0138) model time 0.5736 (0.7296) loss 7.0937 (6.5444) grad_norm 3.2413 (2.7449) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:27:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][200/625] eta 0:05:05 lr 0.000015 wd 0.0500 time 0.5784 (0.7188) data time 0.0009 (0.0119) model time 0.5775 (0.7070) loss 7.2809 (6.5318) grad_norm 3.7524 (2.8948) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:27:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][210/625] eta 0:04:50 lr 0.000015 wd 0.0500 time 0.5776 (0.7006) data time 0.0009 (0.0104) model time 0.5767 (0.6901) loss 6.4601 (6.4916) grad_norm 3.0098 (2.8425) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:27:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][220/625] eta 0:04:38 lr 0.000015 wd 0.0500 time 0.5794 (0.6866) data time 0.0009 (0.0093) model time 0.5785 (0.6773) loss 6.1146 (6.4746) grad_norm 2.1363 (2.8778) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:27:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][230/625] eta 0:04:26 lr 0.000015 wd 0.0500 time 0.5741 (0.6755) data time 0.0009 (0.0085) model time 0.5732 (0.6671) loss 7.2370 (6.4761) grad_norm 2.1582 (2.8291) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:27:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][240/625] eta 0:04:16 lr 0.000015 wd 0.0500 time 0.5796 (0.6664) data time 0.0007 (0.0079) model time 0.5789 (0.6585) loss 6.5241 (6.5096) grad_norm 2.6554 (2.8169) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:27:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][250/625] eta 0:04:07 lr 0.000015 wd 0.0500 time 0.5793 (0.6590) data time 0.0009 (0.0073) model time 0.5784 (0.6517) loss 7.1759 (6.5161) grad_norm 3.2918 (2.8244) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:27:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][260/625] eta 0:03:58 lr 0.000015 wd 0.0500 time 0.5765 (0.6526) data time 0.0009 (0.0068) model time 0.5756 (0.6458) loss 6.6514 (6.5126) grad_norm 1.8964 (2.8323) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:27:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][270/625] eta 0:03:49 lr 0.000015 wd 0.0500 time 0.5818 (0.6475) data time 0.0006 (0.0064) model time 0.5812 (0.6411) loss 5.6701 (6.5105) grad_norm 2.8541 (2.8451) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:27:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][280/625] eta 0:03:41 lr 0.000015 wd 0.0500 time 0.5902 (0.6432) data time 0.0009 (0.0060) model time 0.5893 (0.6372) loss 6.1926 (6.4940) grad_norm 3.5888 (2.8389) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:28:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][290/625] eta 0:03:34 lr 0.000015 wd 0.0500 time 0.5857 (0.6395) data time 0.0009 (0.0057) model time 0.5848 (0.6338) loss 5.9187 (6.5068) grad_norm 2.4079 (2.8729) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:28:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][300/625] eta 0:03:26 lr 0.000015 wd 0.0500 time 0.5818 (0.6362) data time 0.0008 (0.0054) model time 0.5810 (0.6308) loss 6.8458 (6.5197) grad_norm 2.4723 (2.8830) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:28:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][310/625] eta 0:03:19 lr 0.000015 wd 0.0500 time 0.5781 (0.6330) data time 0.0008 (0.0051) model time 0.5773 (0.6279) loss 7.0575 (6.5237) grad_norm 36.0407 (3.0690) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:28:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][320/625] eta 0:03:12 lr 0.000015 wd 0.0500 time 0.5786 (0.6301) data time 0.0006 (0.0049) model time 0.5779 (0.6252) loss 5.8251 (6.5064) grad_norm 3.7663 (3.0767) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:28:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][330/625] eta 0:03:05 lr 0.000015 wd 0.0500 time 0.5761 (0.6276) data time 0.0007 (0.0047) model time 0.5755 (0.6229) loss 7.0648 (6.5007) grad_norm 6.8477 (3.0876) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:28:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][340/625] eta 0:02:58 lr 0.000015 wd 0.0500 time 0.5820 (0.6253) data time 0.0007 (0.0045) model time 0.5813 (0.6208) loss 6.5378 (6.4882) grad_norm 4.0927 (3.0732) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:28:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][350/625] eta 0:02:51 lr 0.000015 wd 0.0500 time 0.5801 (0.6233) data time 0.0007 (0.0044) model time 0.5795 (0.6190) loss 6.7149 (6.4844) grad_norm 3.1936 (3.1230) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:28:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][360/625] eta 0:02:44 lr 0.000015 wd 0.0500 time 0.5829 (0.6215) data time 0.0009 (0.0042) model time 0.5820 (0.6173) loss 6.3312 (6.5049) grad_norm 2.7354 (3.1226) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:28:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][370/625] eta 0:02:38 lr 0.000015 wd 0.0500 time 0.5792 (0.6198) data time 0.0007 (0.0041) model time 0.5785 (0.6158) loss 5.9595 (6.5128) grad_norm 2.6549 (3.1429) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:28:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][380/625] eta 0:02:31 lr 0.000015 wd 0.0500 time 0.5760 (0.6182) data time 0.0009 (0.0039) model time 0.5751 (0.6142) loss 6.8552 (6.5156) grad_norm 3.9952 (3.1260) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:29:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][390/625] eta 0:02:24 lr 0.000015 wd 0.0500 time 0.5770 (0.6167) data time 0.0006 (0.0038) model time 0.5764 (0.6129) loss 5.3955 (6.4973) grad_norm 2.3616 (3.1083) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:29:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][400/625] eta 0:02:18 lr 0.000015 wd 0.0500 time 0.5845 (0.6153) data time 0.0006 (0.0037) model time 0.5840 (0.6116) loss 5.0432 (6.4825) grad_norm 1.9599 (3.0940) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:29:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][410/625] eta 0:02:12 lr 0.000015 wd 0.0500 time 0.5751 (0.6148) data time 0.0008 (0.0036) model time 0.5743 (0.6112) loss 6.5680 (6.4860) grad_norm 2.2212 (3.0951) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:29:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][420/625] eta 0:02:05 lr 0.000015 wd 0.0500 time 0.5826 (0.6137) data time 0.0007 (0.0035) model time 0.5820 (0.6102) loss 7.1159 (6.4874) grad_norm 3.1435 (3.0968) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:29:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][430/625] eta 0:01:59 lr 0.000015 wd 0.0500 time 0.5802 (0.6126) data time 0.0008 (0.0034) model time 0.5794 (0.6092) loss 5.2133 (6.4720) grad_norm 2.6690 (3.0927) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 10:29:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 10:29:26 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 10:29:29 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 10:32:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 10:32:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 10:33:33 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 10:33:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 10:33:47 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 10:33:47 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 10:33:48 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 10:33:48 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 291) +[2024-07-29 10:33:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 10:34:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][440/625] eta 0:09:11 lr 0.000015 wd 0.0500 time 0.5166 (2.9820) data time 0.0011 (0.1359) model time 0.5155 (2.8461) loss 6.7718 (6.6397) grad_norm 2.1663 (3.3098) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:34:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][450/625] eta 0:03:55 lr 0.000015 wd 0.0500 time 0.5150 (1.3444) data time 0.0011 (0.0461) model time 0.5139 (1.2983) loss 7.4525 (6.7269) grad_norm 16.4640 (3.8653) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:34:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][460/625] eta 0:02:48 lr 0.000015 wd 0.0500 time 0.5144 (1.0223) data time 0.0011 (0.0281) model time 0.5133 (0.9942) loss 7.2080 (6.7510) grad_norm 2.8361 (3.4374) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:34:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][470/625] eta 0:02:16 lr 0.000015 wd 0.0500 time 0.5238 (0.8811) data time 0.0010 (0.0204) model time 0.5228 (0.8608) loss 6.5757 (6.7494) grad_norm 1.9280 (3.2155) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:34:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][480/625] eta 0:01:57 lr 0.000015 wd 0.0500 time 0.5191 (0.8071) data time 0.0010 (0.0161) model time 0.5181 (0.7910) loss 7.4534 (6.7445) grad_norm 3.7795 (3.1244) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:34:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][490/625] eta 0:01:43 lr 0.000015 wd 0.0500 time 0.7591 (0.7645) data time 0.0007 (0.0135) model time 0.7584 (0.7510) loss 5.1128 (6.7065) grad_norm 2.4513 (3.3783) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 10:34:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][500/625] eta 0:01:31 lr 0.000015 wd 0.0500 time 0.5159 (0.7304) data time 0.0009 (0.0116) model time 0.5150 (0.7188) loss 6.9103 (6.6470) grad_norm 2.1462 (3.2764) loss_scale 512.0000 (275.6923) mem 22344MB +[2024-07-29 10:34:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][510/625] eta 0:01:20 lr 0.000015 wd 0.0500 time 0.5148 (0.7032) data time 0.0010 (0.0102) model time 0.5138 (0.6930) loss 6.5253 (6.6234) grad_norm 2.3664 (3.2144) loss_scale 512.0000 (307.2000) mem 22344MB +[2024-07-29 10:34:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][520/625] eta 0:01:11 lr 0.000014 wd 0.0500 time 0.5181 (0.6833) data time 0.0007 (0.0091) model time 0.5174 (0.6742) loss 6.3454 (6.6128) grad_norm 2.2014 (3.1675) loss_scale 512.0000 (331.2941) mem 22344MB +[2024-07-29 10:34:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][530/625] eta 0:01:03 lr 0.000014 wd 0.0500 time 0.5155 (0.6670) data time 0.0010 (0.0082) model time 0.5145 (0.6587) loss 6.8649 (6.5990) grad_norm 3.1519 (3.2390) loss_scale 512.0000 (350.3158) mem 22344MB +[2024-07-29 10:35:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][540/625] eta 0:00:55 lr 0.000014 wd 0.0500 time 0.5167 (0.6547) data time 0.0010 (0.0076) model time 0.5157 (0.6471) loss 6.0546 (6.6020) grad_norm 2.7554 (3.2315) loss_scale 512.0000 (365.7143) mem 22344MB +[2024-07-29 10:35:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][550/625] eta 0:00:48 lr 0.000014 wd 0.0500 time 0.5156 (0.6440) data time 0.0012 (0.0070) model time 0.5144 (0.6370) loss 5.8943 (6.5794) grad_norm 2.9538 (3.2143) loss_scale 512.0000 (378.4348) mem 22344MB +[2024-07-29 10:35:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][560/625] eta 0:00:41 lr 0.000014 wd 0.0500 time 0.5259 (0.6349) data time 0.0011 (0.0065) model time 0.5248 (0.6284) loss 7.2640 (6.5857) grad_norm 2.2464 (3.1601) loss_scale 512.0000 (389.1200) mem 22344MB +[2024-07-29 10:35:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][570/625] eta 0:00:34 lr 0.000014 wd 0.0500 time 0.5156 (0.6275) data time 0.0007 (0.0061) model time 0.5148 (0.6214) loss 6.2874 (6.5994) grad_norm 2.3100 (3.1207) loss_scale 512.0000 (398.2222) mem 22344MB +[2024-07-29 10:35:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][580/625] eta 0:00:27 lr 0.000014 wd 0.0500 time 0.5186 (0.6208) data time 0.0009 (0.0058) model time 0.5177 (0.6150) loss 7.6927 (6.6065) grad_norm 2.0047 (3.0878) loss_scale 512.0000 (406.0690) mem 22344MB +[2024-07-29 10:35:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][590/625] eta 0:00:21 lr 0.000014 wd 0.0500 time 0.5147 (0.6154) data time 0.0009 (0.0055) model time 0.5138 (0.6099) loss 7.6334 (6.6128) grad_norm 2.6731 (3.1046) loss_scale 512.0000 (412.9032) mem 22344MB +[2024-07-29 10:35:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][600/625] eta 0:00:15 lr 0.000014 wd 0.0500 time 0.5176 (0.6100) data time 0.0010 (0.0052) model time 0.5166 (0.6048) loss 7.9373 (6.6155) grad_norm 3.2203 (3.0815) loss_scale 512.0000 (418.9091) mem 22344MB +[2024-07-29 10:35:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][610/625] eta 0:00:09 lr 0.000014 wd 0.0500 time 0.5436 (0.6056) data time 0.0006 (0.0050) model time 0.5430 (0.6006) loss 6.5259 (6.6024) grad_norm 4.0598 (3.0651) loss_scale 512.0000 (424.2286) mem 22344MB +[2024-07-29 10:35:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [291/300][620/625] eta 0:00:03 lr 0.000014 wd 0.0500 time 0.5135 (0.6011) data time 0.0007 (0.0048) model time 0.5128 (0.5963) loss 7.2068 (6.5988) grad_norm 1.9928 (3.0862) loss_scale 512.0000 (428.9730) mem 22344MB +[2024-07-29 10:35:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 291 training takes 0:01:53 +[2024-07-29 10:35:48 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 10:35:52 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 10:35:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.570 (0.570) Loss 0.4868 (0.4868) Acc@1 90.527 (90.527) Acc@5 98.975 (98.975) Mem 22344MB +[2024-07-29 10:35:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.120 (0.161) Loss 0.7314 (0.5878) Acc@1 83.154 (88.281) Acc@5 97.217 (98.184) Mem 22344MB +[2024-07-29 10:35:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.143) Loss 0.8071 (0.6732) Acc@1 81.250 (85.740) Acc@5 96.191 (97.459) Mem 22344MB +[2024-07-29 10:35:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.399 Acc@5 97.441 +[2024-07-29 10:35:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.4% +[2024-07-29 10:35:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.987 (0.987) Loss 0.4897 (0.4897) Acc@1 90.381 (90.381) Acc@5 99.023 (99.023) Mem 22344MB +[2024-07-29 10:36:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.117 (0.220) Loss 0.7329 (0.5923) Acc@1 83.252 (88.290) Acc@5 97.314 (98.171) Mem 22344MB +[2024-07-29 10:36:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.171) Loss 0.8101 (0.6764) Acc@1 81.250 (85.672) Acc@5 96.338 (97.424) Mem 22344MB +[2024-07-29 10:36:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.301 Acc@5 97.415 +[2024-07-29 10:36:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 10:36:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.30% +[2024-07-29 10:36:02 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 10:36:05 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 10:36:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][0/625] eta 0:18:16 lr 0.000014 wd 0.0500 time 1.7539 (1.7539) data time 0.5909 (0.5909) model time 0.0000 (0.0000) loss 5.8011 (5.8011) grad_norm 2.9061 (2.9061) loss_scale 512.0000 (512.0000) mem 22337MB +[2024-07-29 10:36:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][10/625] eta 0:06:38 lr 0.000014 wd 0.0500 time 0.5182 (0.6481) data time 0.0010 (0.0548) model time 0.0000 (0.0000) loss 6.8612 (6.2320) grad_norm 2.1465 (2.7234) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:36:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][20/625] eta 0:05:59 lr 0.000014 wd 0.0500 time 0.5166 (0.5940) data time 0.0011 (0.0292) model time 0.0000 (0.0000) loss 6.2367 (6.2548) grad_norm 2.8011 (3.1279) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:36:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][30/625] eta 0:05:43 lr 0.000014 wd 0.0500 time 0.5152 (0.5766) data time 0.0010 (0.0201) model time 0.0000 (0.0000) loss 6.8503 (6.3502) grad_norm 2.8643 (3.2535) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:36:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][40/625] eta 0:05:30 lr 0.000014 wd 0.0500 time 0.5153 (0.5648) data time 0.0012 (0.0155) model time 0.0000 (0.0000) loss 7.0240 (6.4618) grad_norm 4.0979 (3.1037) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:36:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][50/625] eta 0:05:22 lr 0.000014 wd 0.0500 time 0.5143 (0.5601) data time 0.0009 (0.0127) model time 0.0000 (0.0000) loss 7.7037 (6.5270) grad_norm 7.4873 (3.0805) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:36:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][60/625] eta 0:05:13 lr 0.000014 wd 0.0500 time 0.5834 (0.5542) data time 0.0009 (0.0108) model time 0.5825 (0.5232) loss 5.6951 (6.4731) grad_norm 1.8019 (2.9597) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:36:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][70/625] eta 0:05:05 lr 0.000014 wd 0.0500 time 0.5197 (0.5509) data time 0.0010 (0.0094) model time 0.5187 (0.5264) loss 5.4956 (6.4578) grad_norm 2.8595 (2.9690) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:36:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][80/625] eta 0:04:59 lr 0.000014 wd 0.0500 time 0.5157 (0.5496) data time 0.0007 (0.0084) model time 0.5149 (0.5306) loss 7.3235 (6.4239) grad_norm 2.6387 (2.9339) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:36:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][90/625] eta 0:04:54 lr 0.000014 wd 0.0500 time 0.5161 (0.5496) data time 0.0009 (0.0076) model time 0.5151 (0.5352) loss 7.3494 (6.4488) grad_norm 2.4098 (3.1644) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:37:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][100/625] eta 0:04:47 lr 0.000014 wd 0.0500 time 0.5195 (0.5481) data time 0.0010 (0.0069) model time 0.5184 (0.5348) loss 5.4620 (6.4359) grad_norm 2.4810 (3.1363) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:37:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][110/625] eta 0:04:41 lr 0.000014 wd 0.0500 time 0.5169 (0.5466) data time 0.0008 (0.0064) model time 0.5161 (0.5340) loss 5.9660 (6.4085) grad_norm 1.7262 (3.0937) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:37:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][120/625] eta 0:04:36 lr 0.000014 wd 0.0500 time 0.6862 (0.5466) data time 0.0011 (0.0060) model time 0.6851 (0.5356) loss 6.0463 (6.4180) grad_norm 4.4018 (3.0759) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:37:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][130/625] eta 0:04:29 lr 0.000014 wd 0.0500 time 0.5239 (0.5451) data time 0.0010 (0.0059) model time 0.5229 (0.5340) loss 6.0875 (6.4598) grad_norm 2.7729 (3.1021) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:37:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][140/625] eta 0:04:24 lr 0.000014 wd 0.0500 time 0.5165 (0.5444) data time 0.0009 (0.0056) model time 0.5156 (0.5340) loss 6.1227 (6.4757) grad_norm 2.6911 (3.0635) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:37:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][150/625] eta 0:04:18 lr 0.000014 wd 0.0500 time 0.5154 (0.5440) data time 0.0011 (0.0056) model time 0.5143 (0.5338) loss 6.4071 (6.4658) grad_norm 3.9826 (3.0437) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:37:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][160/625] eta 0:04:12 lr 0.000014 wd 0.0500 time 0.5174 (0.5439) data time 0.0009 (0.0054) model time 0.5165 (0.5344) loss 6.1897 (6.4889) grad_norm 3.4128 (3.0915) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:37:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][170/625] eta 0:04:07 lr 0.000014 wd 0.0500 time 0.6708 (0.5435) data time 0.0008 (0.0051) model time 0.6700 (0.5345) loss 6.8304 (6.4815) grad_norm 3.3050 (3.0606) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:37:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][180/625] eta 0:04:01 lr 0.000014 wd 0.0500 time 0.5202 (0.5428) data time 0.0010 (0.0049) model time 0.5192 (0.5341) loss 7.1595 (6.4861) grad_norm 3.7010 (3.0534) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:37:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][190/625] eta 0:03:56 lr 0.000014 wd 0.0500 time 0.6762 (0.5429) data time 0.0008 (0.0047) model time 0.6754 (0.5348) loss 6.0970 (6.4813) grad_norm 3.8965 (3.1633) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:37:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][200/625] eta 0:03:50 lr 0.000014 wd 0.0500 time 0.5149 (0.5423) data time 0.0012 (0.0045) model time 0.5136 (0.5345) loss 5.2244 (6.4507) grad_norm 2.1004 (3.1475) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:37:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][210/625] eta 0:03:45 lr 0.000014 wd 0.0500 time 0.5673 (0.5427) data time 0.0010 (0.0044) model time 0.5663 (0.5354) loss 6.2357 (6.4594) grad_norm 2.5380 (3.1248) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:38:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][220/625] eta 0:03:39 lr 0.000014 wd 0.0500 time 0.5325 (0.5425) data time 0.0007 (0.0043) model time 0.5317 (0.5354) loss 6.1886 (6.4583) grad_norm 2.9278 (3.1408) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:38:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][230/625] eta 0:03:34 lr 0.000014 wd 0.0500 time 0.5328 (0.5428) data time 0.0009 (0.0041) model time 0.5320 (0.5362) loss 6.3937 (6.4682) grad_norm 2.2817 (3.1566) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:38:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][240/625] eta 0:03:28 lr 0.000014 wd 0.0500 time 0.5164 (0.5427) data time 0.0016 (0.0040) model time 0.5148 (0.5363) loss 6.7588 (6.4759) grad_norm 4.8873 (3.1407) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:38:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 10:38:17 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 10:38:18 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 10:41:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 10:41:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 10:41:27 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 10:41:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 10:41:37 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 10:41:37 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 10:41:37 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 10:41:38 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 292) +[2024-07-29 10:41:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 10:41:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 10:41:51 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 10:41:56 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 10:46:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 10:46:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 10:46:52 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 10:47:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 10:47:10 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 10:47:10 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 10:47:11 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 10:47:11 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 292) +[2024-07-29 10:47:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 10:47:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][250/625] eta 0:12:41 lr 0.000014 wd 0.0500 time 0.5698 (2.0306) data time 0.0011 (0.1417) model time 0.5687 (1.8889) loss 6.8199 (6.9199) grad_norm 5.1484 (3.4091) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:47:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][260/625] eta 0:07:08 lr 0.000014 wd 0.0500 time 0.5862 (1.1747) data time 0.0009 (0.0589) model time 0.5853 (1.1158) loss 6.3010 (6.6659) grad_norm 3.5961 (3.3095) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:47:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][270/625] eta 0:05:37 lr 0.000014 wd 0.0500 time 0.5673 (0.9514) data time 0.0008 (0.0374) model time 0.5666 (0.9140) loss 6.4741 (6.6994) grad_norm 3.8046 (3.1985) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:47:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][280/625] eta 0:04:52 lr 0.000014 wd 0.0500 time 0.5693 (0.8483) data time 0.0010 (0.0276) model time 0.5683 (0.8207) loss 6.0471 (6.6893) grad_norm 2.4718 (3.1730) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:47:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][290/625] eta 0:04:24 lr 0.000014 wd 0.0500 time 0.5719 (0.7893) data time 0.0007 (0.0219) model time 0.5712 (0.7673) loss 6.4340 (6.6590) grad_norm 2.7887 (3.4951) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:47:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][300/625] eta 0:04:06 lr 0.000014 wd 0.0500 time 0.5688 (0.7580) data time 0.0009 (0.0182) model time 0.5680 (0.7397) loss 6.3691 (6.6219) grad_norm 4.5020 (3.3158) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:48:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][310/625] eta 0:03:50 lr 0.000014 wd 0.0500 time 0.5782 (0.7306) data time 0.0009 (0.0156) model time 0.5774 (0.7150) loss 6.9714 (6.5756) grad_norm 3.4972 (3.2860) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:48:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][320/625] eta 0:03:36 lr 0.000014 wd 0.0500 time 0.5760 (0.7108) data time 0.0009 (0.0137) model time 0.5751 (0.6971) loss 6.1856 (6.5490) grad_norm 2.6194 (3.4087) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:48:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][330/625] eta 0:03:25 lr 0.000014 wd 0.0500 time 0.5733 (0.6952) data time 0.0009 (0.0123) model time 0.5723 (0.6829) loss 6.3308 (6.5349) grad_norm 1.9914 (3.3500) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:48:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][340/625] eta 0:03:14 lr 0.000014 wd 0.0500 time 0.5737 (0.6827) data time 0.0009 (0.0111) model time 0.5728 (0.6716) loss 7.4942 (6.5520) grad_norm 3.3153 (3.2781) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:48:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][350/625] eta 0:03:04 lr 0.000014 wd 0.0500 time 0.5739 (0.6725) data time 0.0006 (0.0101) model time 0.5733 (0.6623) loss 6.9219 (6.5829) grad_norm 2.0181 (3.2372) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:48:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][360/625] eta 0:02:55 lr 0.000014 wd 0.0500 time 0.5731 (0.6640) data time 0.0009 (0.0093) model time 0.5723 (0.6547) loss 7.0616 (6.5778) grad_norm 2.4391 (3.1642) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:48:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][370/625] eta 0:02:47 lr 0.000014 wd 0.0500 time 0.5726 (0.6568) data time 0.0008 (0.0087) model time 0.5717 (0.6482) loss 7.2359 (6.5694) grad_norm 2.0924 (3.1499) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:48:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][380/625] eta 0:02:39 lr 0.000014 wd 0.0500 time 0.5741 (0.6507) data time 0.0009 (0.0081) model time 0.5732 (0.6426) loss 5.2679 (6.5714) grad_norm 7.7035 (3.1509) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:48:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][390/625] eta 0:02:31 lr 0.000014 wd 0.0500 time 0.5725 (0.6455) data time 0.0008 (0.0076) model time 0.5717 (0.6379) loss 5.9862 (6.5572) grad_norm 2.9320 (3.1514) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:48:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][400/625] eta 0:02:24 lr 0.000014 wd 0.0500 time 0.5739 (0.6410) data time 0.0009 (0.0072) model time 0.5730 (0.6338) loss 5.9564 (6.5569) grad_norm 2.2949 (3.1605) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:49:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][410/625] eta 0:02:16 lr 0.000014 wd 0.0500 time 0.5740 (0.6370) data time 0.0008 (0.0068) model time 0.5732 (0.6302) loss 7.0875 (6.5764) grad_norm 2.6346 (3.1472) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:49:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][420/625] eta 0:02:09 lr 0.000014 wd 0.0500 time 0.5727 (0.6334) data time 0.0009 (0.0065) model time 0.5719 (0.6270) loss 6.9974 (6.5652) grad_norm 2.9598 (3.1178) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:49:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][430/625] eta 0:02:02 lr 0.000014 wd 0.0500 time 0.5711 (0.6302) data time 0.0006 (0.0062) model time 0.5705 (0.6241) loss 6.4410 (6.5589) grad_norm 2.2999 (3.1145) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:49:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][440/625] eta 0:01:56 lr 0.000014 wd 0.0500 time 0.5718 (0.6273) data time 0.0007 (0.0059) model time 0.5711 (0.6214) loss 6.7531 (6.5512) grad_norm 3.1496 (3.1132) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:49:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][450/625] eta 0:01:49 lr 0.000014 wd 0.0500 time 0.5733 (0.6247) data time 0.0006 (0.0057) model time 0.5727 (0.6190) loss 6.6330 (6.5331) grad_norm 2.3013 (3.0835) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:49:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][460/625] eta 0:01:42 lr 0.000014 wd 0.0500 time 0.5753 (0.6224) data time 0.0007 (0.0054) model time 0.5746 (0.6170) loss 7.1053 (6.5460) grad_norm 2.7170 (3.0686) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:49:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][470/625] eta 0:01:36 lr 0.000014 wd 0.0500 time 0.5747 (0.6203) data time 0.0008 (0.0052) model time 0.5739 (0.6151) loss 6.3084 (6.5498) grad_norm 2.8463 (3.0565) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:49:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][480/625] eta 0:01:29 lr 0.000014 wd 0.0500 time 0.5713 (0.6184) data time 0.0007 (0.0051) model time 0.5706 (0.6133) loss 6.1425 (6.5524) grad_norm 4.4778 (3.0322) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:49:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][490/625] eta 0:01:23 lr 0.000014 wd 0.0500 time 0.5715 (0.6166) data time 0.0008 (0.0049) model time 0.5706 (0.6117) loss 6.9616 (6.5466) grad_norm 2.6054 (3.0316) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:49:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][500/625] eta 0:01:16 lr 0.000014 wd 0.0500 time 0.5697 (0.6149) data time 0.0007 (0.0047) model time 0.5690 (0.6101) loss 5.2126 (6.5271) grad_norm 2.4443 (3.0159) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:49:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][510/625] eta 0:01:10 lr 0.000014 wd 0.0500 time 0.5745 (0.6133) data time 0.0006 (0.0046) model time 0.5739 (0.6087) loss 5.7449 (6.5160) grad_norm 2.7480 (3.0088) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:50:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][520/625] eta 0:01:04 lr 0.000014 wd 0.0500 time 0.5722 (0.6128) data time 0.0009 (0.0044) model time 0.5714 (0.6084) loss 6.5230 (6.5223) grad_norm 3.4067 (3.0009) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:50:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][530/625] eta 0:00:58 lr 0.000014 wd 0.0500 time 0.5746 (0.6115) data time 0.0006 (0.0043) model time 0.5740 (0.6072) loss 7.7787 (6.5305) grad_norm 2.0744 (3.0013) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:50:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][540/625] eta 0:00:51 lr 0.000014 wd 0.0500 time 0.5732 (0.6104) data time 0.0009 (0.0042) model time 0.5723 (0.6062) loss 6.0638 (6.5276) grad_norm 3.8217 (2.9970) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:50:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][550/625] eta 0:00:45 lr 0.000014 wd 0.0500 time 0.5749 (0.6094) data time 0.0007 (0.0041) model time 0.5742 (0.6053) loss 6.7854 (6.5255) grad_norm 2.2215 (2.9916) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:50:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][560/625] eta 0:00:39 lr 0.000014 wd 0.0500 time 0.5739 (0.6084) data time 0.0008 (0.0040) model time 0.5731 (0.6044) loss 7.8438 (6.5451) grad_norm 3.4035 (2.9903) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:50:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][570/625] eta 0:00:33 lr 0.000014 wd 0.0500 time 0.5741 (0.6074) data time 0.0006 (0.0039) model time 0.5735 (0.6035) loss 5.7951 (6.5522) grad_norm 2.9585 (2.9883) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:50:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][580/625] eta 0:00:27 lr 0.000014 wd 0.0500 time 0.5745 (0.6064) data time 0.0008 (0.0038) model time 0.5736 (0.6026) loss 5.9101 (6.5567) grad_norm 2.6593 (2.9862) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:50:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][590/625] eta 0:00:21 lr 0.000014 wd 0.0500 time 0.5742 (0.6055) data time 0.0006 (0.0037) model time 0.5736 (0.6018) loss 5.9855 (6.5600) grad_norm 2.3999 (2.9967) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:50:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][600/625] eta 0:00:15 lr 0.000014 wd 0.0500 time 0.5747 (0.6046) data time 0.0010 (0.0036) model time 0.5737 (0.6010) loss 6.8199 (6.5533) grad_norm 2.4365 (2.9870) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:50:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][610/625] eta 0:00:09 lr 0.000014 wd 0.0500 time 0.5754 (0.6039) data time 0.0004 (0.0036) model time 0.5749 (0.6003) loss 5.3868 (6.5480) grad_norm 3.1489 (2.9780) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:51:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [292/300][620/625] eta 0:00:03 lr 0.000014 wd 0.0500 time 0.5720 (0.6031) data time 0.0006 (0.0035) model time 0.5713 (0.5996) loss 5.5617 (6.5448) grad_norm 2.1920 (2.9974) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:51:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 292 training takes 0:03:49 +[2024-07-29 10:51:04 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 10:51:10 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 10:51:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.469 (0.469) Loss 0.4834 (0.4834) Acc@1 90.527 (90.527) Acc@5 98.975 (98.975) Mem 22341MB +[2024-07-29 10:51:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7383 (0.5861) Acc@1 82.861 (88.303) Acc@5 97.168 (98.149) Mem 22341MB +[2024-07-29 10:51:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8037 (0.6713) Acc@1 81.445 (85.791) Acc@5 96.240 (97.391) Mem 22341MB +[2024-07-29 10:51:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.425 Acc@5 97.377 +[2024-07-29 10:51:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.4% +[2024-07-29 10:51:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 85.42% +[2024-07-29 10:51:15 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-29 10:51:17 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-29 10:51:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.473 (0.473) Loss 0.4893 (0.4893) Acc@1 90.430 (90.430) Acc@5 99.023 (99.023) Mem 22341MB +[2024-07-29 10:51:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.124 (0.157) Loss 0.7334 (0.5919) Acc@1 83.154 (88.277) Acc@5 97.314 (98.162) Mem 22341MB +[2024-07-29 10:51:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8105 (0.6763) Acc@1 81.299 (85.682) Acc@5 96.338 (97.417) Mem 22341MB +[2024-07-29 10:51:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.307 Acc@5 97.409 +[2024-07-29 10:51:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 10:51:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.31% +[2024-07-29 10:51:20 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 10:51:21 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 10:51:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][0/625] eta 0:11:33 lr 0.000014 wd 0.0500 time 1.1099 (1.1099) data time 0.4358 (0.4358) model time 0.0000 (0.0000) loss 6.9101 (6.9101) grad_norm 2.4373 (2.4373) loss_scale 512.0000 (512.0000) mem 22337MB +[2024-07-29 10:51:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][10/625] eta 0:06:26 lr 0.000014 wd 0.0500 time 0.5728 (0.6283) data time 0.0007 (0.0418) model time 0.0000 (0.0000) loss 5.8381 (6.2106) grad_norm 1.9635 (2.6239) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:51:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][20/625] eta 0:06:06 lr 0.000014 wd 0.0500 time 0.5767 (0.6065) data time 0.0007 (0.0223) model time 0.0000 (0.0000) loss 6.8532 (6.3786) grad_norm 3.5115 (2.5807) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:51:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][30/625] eta 0:05:54 lr 0.000014 wd 0.0500 time 0.5737 (0.5957) data time 0.0009 (0.0154) model time 0.0000 (0.0000) loss 5.5634 (6.3996) grad_norm 3.7197 (2.5928) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:51:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][40/625] eta 0:05:45 lr 0.000014 wd 0.0500 time 0.5752 (0.5910) data time 0.0007 (0.0118) model time 0.0000 (0.0000) loss 6.5356 (6.4137) grad_norm 2.4415 (2.6272) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:51:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][50/625] eta 0:05:38 lr 0.000014 wd 0.0500 time 0.5756 (0.5884) data time 0.0007 (0.0097) model time 0.0000 (0.0000) loss 5.3131 (6.4719) grad_norm 16.7090 (2.9074) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:51:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][60/625] eta 0:05:31 lr 0.000014 wd 0.0500 time 0.5750 (0.5861) data time 0.0007 (0.0082) model time 0.5743 (0.5733) loss 6.7215 (6.5081) grad_norm 2.6388 (3.0632) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:52:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][70/625] eta 0:05:24 lr 0.000014 wd 0.0500 time 0.5791 (0.5847) data time 0.0009 (0.0072) model time 0.5782 (0.5745) loss 5.3558 (6.4841) grad_norm 2.3717 (2.9918) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:52:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][80/625] eta 0:05:18 lr 0.000014 wd 0.0500 time 0.5765 (0.5841) data time 0.0007 (0.0064) model time 0.5759 (0.5761) loss 6.1301 (6.4388) grad_norm 2.8605 (3.2761) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:52:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][90/625] eta 0:05:12 lr 0.000014 wd 0.0500 time 0.5755 (0.5832) data time 0.0009 (0.0058) model time 0.5746 (0.5757) loss 6.0048 (6.3712) grad_norm 3.4272 (3.2094) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:52:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][100/625] eta 0:05:05 lr 0.000014 wd 0.0500 time 0.5722 (0.5824) data time 0.0007 (0.0053) model time 0.5715 (0.5753) loss 7.2481 (6.3891) grad_norm 1.6987 (3.2101) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:52:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][110/625] eta 0:04:59 lr 0.000014 wd 0.0500 time 0.5763 (0.5817) data time 0.0008 (0.0049) model time 0.5755 (0.5751) loss 7.0897 (6.4086) grad_norm 2.8420 (3.2200) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:52:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][120/625] eta 0:04:54 lr 0.000014 wd 0.0500 time 0.5767 (0.5825) data time 0.0009 (0.0046) model time 0.5759 (0.5773) loss 6.2408 (6.4031) grad_norm 2.4153 (3.1803) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:52:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][130/625] eta 0:04:48 lr 0.000014 wd 0.0500 time 0.5740 (0.5818) data time 0.0007 (0.0043) model time 0.5733 (0.5768) loss 6.1744 (6.4290) grad_norm 1.9815 (3.1502) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:52:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][140/625] eta 0:04:42 lr 0.000014 wd 0.0500 time 0.5778 (0.5815) data time 0.0008 (0.0041) model time 0.5770 (0.5767) loss 5.8202 (6.4248) grad_norm 2.4941 (3.1216) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:52:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][150/625] eta 0:04:36 lr 0.000014 wd 0.0500 time 0.5783 (0.5811) data time 0.0009 (0.0038) model time 0.5775 (0.5765) loss 6.5537 (6.4168) grad_norm 2.4655 (3.0952) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:52:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][160/625] eta 0:04:30 lr 0.000014 wd 0.0500 time 0.5754 (0.5808) data time 0.0006 (0.0037) model time 0.5748 (0.5765) loss 5.4457 (6.4131) grad_norm 2.8726 (3.1683) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:53:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][170/625] eta 0:04:24 lr 0.000014 wd 0.0500 time 0.5730 (0.5805) data time 0.0007 (0.0035) model time 0.5722 (0.5763) loss 7.4458 (6.4311) grad_norm 2.3076 (3.1459) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:53:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][180/625] eta 0:04:18 lr 0.000014 wd 0.0500 time 0.5739 (0.5801) data time 0.0007 (0.0033) model time 0.5732 (0.5760) loss 6.9031 (6.4559) grad_norm 3.5778 (3.1345) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:53:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][190/625] eta 0:04:12 lr 0.000014 wd 0.0500 time 0.5784 (0.5798) data time 0.0009 (0.0032) model time 0.5775 (0.5758) loss 6.2679 (6.4571) grad_norm 2.2730 (3.1266) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:53:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][200/625] eta 0:04:06 lr 0.000014 wd 0.0500 time 0.5751 (0.5796) data time 0.0007 (0.0031) model time 0.5744 (0.5757) loss 6.4975 (6.4682) grad_norm 2.4909 (3.1108) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:53:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][210/625] eta 0:04:00 lr 0.000014 wd 0.0500 time 0.5768 (0.5794) data time 0.0009 (0.0030) model time 0.5759 (0.5757) loss 7.2381 (6.4848) grad_norm 3.6272 (3.0975) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:53:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][220/625] eta 0:03:54 lr 0.000014 wd 0.0500 time 0.5753 (0.5793) data time 0.0007 (0.0029) model time 0.5746 (0.5757) loss 5.9289 (6.4769) grad_norm 1.9523 (3.1175) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:53:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][230/625] eta 0:03:48 lr 0.000014 wd 0.0500 time 0.5770 (0.5791) data time 0.0008 (0.0028) model time 0.5761 (0.5757) loss 5.9238 (6.4698) grad_norm 2.6517 (3.1391) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:53:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][240/625] eta 0:03:43 lr 0.000014 wd 0.0500 time 0.5754 (0.5795) data time 0.0009 (0.0027) model time 0.5745 (0.5762) loss 6.3592 (6.4838) grad_norm 2.5135 (3.1235) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:53:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][250/625] eta 0:03:37 lr 0.000014 wd 0.0500 time 0.5695 (0.5793) data time 0.0009 (0.0027) model time 0.5686 (0.5761) loss 6.6917 (6.4850) grad_norm 2.5633 (3.1056) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:53:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][260/625] eta 0:03:31 lr 0.000014 wd 0.0500 time 0.5847 (0.5791) data time 0.0008 (0.0026) model time 0.5839 (0.5760) loss 5.3942 (6.4799) grad_norm 6.3358 (3.1132) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:53:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][270/625] eta 0:03:25 lr 0.000014 wd 0.0500 time 0.5752 (0.5790) data time 0.0008 (0.0025) model time 0.5744 (0.5759) loss 5.1814 (6.4742) grad_norm 1.9030 (3.1010) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:54:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][280/625] eta 0:03:19 lr 0.000014 wd 0.0500 time 0.5762 (0.5789) data time 0.0009 (0.0025) model time 0.5753 (0.5759) loss 7.3324 (6.4648) grad_norm 2.5808 (3.0838) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:54:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][290/625] eta 0:03:13 lr 0.000014 wd 0.0500 time 0.5800 (0.5788) data time 0.0009 (0.0024) model time 0.5791 (0.5759) loss 6.5119 (6.4732) grad_norm 3.2196 (3.0840) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:54:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][300/625] eta 0:03:08 lr 0.000014 wd 0.0500 time 0.5744 (0.5787) data time 0.0009 (0.0024) model time 0.5735 (0.5759) loss 6.1713 (6.4721) grad_norm 1.9432 (3.0709) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:54:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][310/625] eta 0:03:02 lr 0.000014 wd 0.0500 time 0.5773 (0.5787) data time 0.0008 (0.0023) model time 0.5765 (0.5759) loss 6.4399 (6.4634) grad_norm 2.4872 (3.0822) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:54:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][320/625] eta 0:02:56 lr 0.000014 wd 0.0500 time 0.5731 (0.5786) data time 0.0007 (0.0023) model time 0.5724 (0.5758) loss 6.0789 (6.4524) grad_norm 5.3814 (3.0788) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:54:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][330/625] eta 0:02:50 lr 0.000014 wd 0.0500 time 0.5794 (0.5784) data time 0.0007 (0.0022) model time 0.5787 (0.5758) loss 5.5921 (6.4555) grad_norm 3.9836 (3.0605) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:54:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][340/625] eta 0:02:45 lr 0.000014 wd 0.0500 time 0.5728 (0.5790) data time 0.0007 (0.0022) model time 0.5721 (0.5764) loss 6.0867 (6.4519) grad_norm 3.0790 (3.0573) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:54:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][350/625] eta 0:02:39 lr 0.000014 wd 0.0500 time 0.5755 (0.5788) data time 0.0008 (0.0021) model time 0.5747 (0.5764) loss 7.4632 (6.4603) grad_norm 2.0817 (3.0359) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:54:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][360/625] eta 0:02:33 lr 0.000014 wd 0.0500 time 0.5781 (0.5788) data time 0.0009 (0.0021) model time 0.5772 (0.5763) loss 5.6841 (6.4632) grad_norm 2.0321 (3.0362) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:54:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][370/625] eta 0:02:27 lr 0.000014 wd 0.0500 time 0.5743 (0.5787) data time 0.0006 (0.0021) model time 0.5737 (0.5762) loss 6.9032 (6.4668) grad_norm 3.5142 (3.0413) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:55:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][380/625] eta 0:02:21 lr 0.000014 wd 0.0500 time 0.5768 (0.5786) data time 0.0007 (0.0020) model time 0.5761 (0.5762) loss 6.8602 (6.4764) grad_norm 1.9117 (3.0361) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:55:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][390/625] eta 0:02:15 lr 0.000014 wd 0.0500 time 0.5732 (0.5785) data time 0.0007 (0.0020) model time 0.5725 (0.5762) loss 6.3142 (6.4830) grad_norm 2.2039 (3.0223) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:55:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][400/625] eta 0:02:10 lr 0.000014 wd 0.0500 time 0.5803 (0.5784) data time 0.0006 (0.0020) model time 0.5797 (0.5761) loss 6.9265 (6.4868) grad_norm 3.5781 (3.0102) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:55:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][410/625] eta 0:02:04 lr 0.000014 wd 0.0500 time 0.5753 (0.5783) data time 0.0009 (0.0020) model time 0.5745 (0.5760) loss 7.3509 (6.4809) grad_norm 2.6848 (3.0011) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:55:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][420/625] eta 0:01:58 lr 0.000013 wd 0.0500 time 0.5744 (0.5782) data time 0.0006 (0.0019) model time 0.5737 (0.5759) loss 5.7096 (6.4788) grad_norm 6.2594 (3.0038) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:55:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][430/625] eta 0:01:52 lr 0.000013 wd 0.0500 time 0.5781 (0.5782) data time 0.0008 (0.0019) model time 0.5773 (0.5759) loss 7.0414 (6.4700) grad_norm 5.0538 (3.0076) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:55:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][440/625] eta 0:01:46 lr 0.000013 wd 0.0500 time 0.5755 (0.5781) data time 0.0007 (0.0019) model time 0.5748 (0.5759) loss 5.7646 (6.4749) grad_norm 3.0219 (3.0042) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:55:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][450/625] eta 0:01:41 lr 0.000013 wd 0.0500 time 0.5753 (0.5780) data time 0.0006 (0.0019) model time 0.5746 (0.5759) loss 6.0127 (6.4642) grad_norm 2.3108 (3.0077) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:55:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][460/625] eta 0:01:35 lr 0.000013 wd 0.0500 time 0.5724 (0.5783) data time 0.0008 (0.0018) model time 0.5716 (0.5762) loss 5.6843 (6.4629) grad_norm 3.3267 (3.0009) loss_scale 512.0000 (512.0000) mem 22339MB +[2024-07-29 10:55:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 10:55:53 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 10:55:54 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 10:58:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 10:58:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 10:58:21 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 10:58:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 10:58:47 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 10:58:47 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 10:58:47 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 10:58:47 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 293) +[2024-07-29 10:58:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 10:59:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][470/625] eta 0:18:18 lr 0.000013 wd 0.0500 time 2.1700 (7.0896) data time 0.0010 (0.3613) model time 2.1690 (6.7284) loss 6.4488 (7.0250) grad_norm 2.0284 (2.3693) loss_scale 512.0000 (512.0000) mem 22341MB +[2024-07-29 10:59:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][480/625] eta 0:04:04 lr 0.000013 wd 0.0500 time 0.6043 (1.6832) data time 0.0008 (0.0611) model time 0.6035 (1.6221) loss 6.0437 (6.7742) grad_norm 2.5463 (2.6571) loss_scale 512.0000 (512.0000) mem 22345MB +[2024-07-29 10:59:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][490/625] eta 0:02:40 lr 0.000013 wd 0.0500 time 0.5973 (1.1910) data time 0.0010 (0.0338) model time 0.5963 (1.1572) loss 6.7658 (6.8345) grad_norm 4.6461 (2.8773) loss_scale 512.0000 (512.0000) mem 22345MB +[2024-07-29 10:59:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][500/625] eta 0:02:05 lr 0.000013 wd 0.0500 time 0.5986 (1.0049) data time 0.0008 (0.0235) model time 0.5978 (0.9814) loss 6.5753 (6.8198) grad_norm 2.7131 (2.8118) loss_scale 512.0000 (512.0000) mem 22345MB +[2024-07-29 10:59:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][510/625] eta 0:01:44 lr 0.000013 wd 0.0500 time 0.5981 (0.9078) data time 0.0010 (0.0182) model time 0.5971 (0.8896) loss 7.0633 (6.7387) grad_norm 3.2031 (2.7352) loss_scale 512.0000 (512.0000) mem 22345MB +[2024-07-29 10:59:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][520/625] eta 0:01:29 lr 0.000013 wd 0.0500 time 0.5362 (0.8514) data time 0.0008 (0.0149) model time 0.5355 (0.8365) loss 5.8026 (6.6699) grad_norm 4.3092 (2.9560) loss_scale 512.0000 (512.0000) mem 22345MB +[2024-07-29 10:59:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][530/625] eta 0:01:17 lr 0.000013 wd 0.0500 time 0.6051 (0.8148) data time 0.0008 (0.0127) model time 0.6042 (0.8022) loss 7.1789 (6.6097) grad_norm 2.0080 (2.9991) loss_scale 512.0000 (512.0000) mem 22345MB +[2024-07-29 10:59:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][540/625] eta 0:01:06 lr 0.000013 wd 0.0500 time 0.6074 (0.7857) data time 0.0011 (0.0110) model time 0.6064 (0.7747) loss 6.3685 (6.5635) grad_norm 1.8238 (2.9336) loss_scale 512.0000 (512.0000) mem 22345MB +[2024-07-29 10:59:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][550/625] eta 0:00:57 lr 0.000013 wd 0.0500 time 0.6066 (0.7639) data time 0.0010 (0.0098) model time 0.6056 (0.7541) loss 6.5058 (6.5536) grad_norm 2.2369 (3.0110) loss_scale 512.0000 (512.0000) mem 22345MB +[2024-07-29 11:00:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][560/625] eta 0:00:48 lr 0.000013 wd 0.0500 time 0.6100 (0.7469) data time 0.0008 (0.0089) model time 0.6092 (0.7380) loss 5.1739 (6.5276) grad_norm 2.0188 (2.9860) loss_scale 512.0000 (512.0000) mem 22345MB +[2024-07-29 11:00:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][570/625] eta 0:00:40 lr 0.000013 wd 0.0500 time 0.5994 (0.7324) data time 0.0008 (0.0081) model time 0.5986 (0.7243) loss 6.3313 (6.5412) grad_norm 7.2849 (3.2812) loss_scale 512.0000 (512.0000) mem 22345MB +[2024-07-29 11:00:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][580/625] eta 0:00:32 lr 0.000013 wd 0.0500 time 0.5952 (0.7203) data time 0.0010 (0.0075) model time 0.5942 (0.7128) loss 6.7850 (6.5320) grad_norm 3.0674 (3.2217) loss_scale 512.0000 (512.0000) mem 22345MB +[2024-07-29 11:00:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][590/625] eta 0:00:24 lr 0.000013 wd 0.0500 time 0.5985 (0.7104) data time 0.0008 (0.0069) model time 0.5977 (0.7035) loss 6.1945 (6.5536) grad_norm 3.0785 (3.2378) loss_scale 512.0000 (512.0000) mem 22345MB +[2024-07-29 11:00:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][600/625] eta 0:00:17 lr 0.000013 wd 0.0500 time 0.6125 (0.7022) data time 0.0010 (0.0065) model time 0.6115 (0.6957) loss 5.6583 (6.5505) grad_norm 2.2107 (3.2102) loss_scale 512.0000 (512.0000) mem 22345MB +[2024-07-29 11:00:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][610/625] eta 0:00:10 lr 0.000013 wd 0.0500 time 0.6065 (0.6956) data time 0.0008 (0.0061) model time 0.6057 (0.6895) loss 6.8978 (6.5394) grad_norm 2.5432 (3.2052) loss_scale 512.0000 (512.0000) mem 22345MB +[2024-07-29 11:00:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [293/300][620/625] eta 0:00:03 lr 0.000013 wd 0.0500 time 0.6171 (0.6900) data time 0.0008 (0.0058) model time 0.6163 (0.6842) loss 7.2440 (6.5343) grad_norm 2.3942 (3.1651) loss_scale 512.0000 (512.0000) mem 22345MB +[2024-07-29 11:00:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 293 training takes 0:01:47 +[2024-07-29 11:00:39 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 11:00:47 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 11:00:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.525 (0.525) Loss 0.4885 (0.4885) Acc@1 90.527 (90.527) Acc@5 98.975 (98.975) Mem 22345MB +[2024-07-29 11:00:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.162) Loss 0.7363 (0.5877) Acc@1 83.105 (88.326) Acc@5 97.363 (98.211) Mem 22345MB +[2024-07-29 11:00:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.146) Loss 0.8081 (0.6731) Acc@1 81.348 (85.754) Acc@5 96.240 (97.452) Mem 22345MB +[2024-07-29 11:00:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.407 Acc@5 97.437 +[2024-07-29 11:00:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.4% +[2024-07-29 11:00:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.829 (0.829) Loss 0.4895 (0.4895) Acc@1 90.430 (90.430) Acc@5 99.023 (99.023) Mem 22345MB +[2024-07-29 11:00:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.192) Loss 0.7339 (0.5918) Acc@1 83.105 (88.277) Acc@5 97.314 (98.171) Mem 22345MB +[2024-07-29 11:00:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.160) Loss 0.8101 (0.6760) Acc@1 81.348 (85.689) Acc@5 96.338 (97.424) Mem 22345MB +[2024-07-29 11:00:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.317 Acc@5 97.417 +[2024-07-29 11:00:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 11:00:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.32% +[2024-07-29 11:00:57 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 11:01:04 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 11:01:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][0/625] eta 0:13:34 lr 0.000013 wd 0.0500 time 1.3026 (1.3026) data time 0.5142 (0.5142) model time 0.0000 (0.0000) loss 6.1405 (6.1405) grad_norm 2.5241 (2.5241) loss_scale 512.0000 (512.0000) mem 22337MB +[2024-07-29 11:01:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][10/625] eta 0:06:53 lr 0.000013 wd 0.0500 time 0.6058 (0.6721) data time 0.0011 (0.0477) model time 0.0000 (0.0000) loss 5.9474 (6.5163) grad_norm 2.6676 (3.2974) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:01:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][20/625] eta 0:06:27 lr 0.000013 wd 0.0500 time 0.6270 (0.6413) data time 0.0012 (0.0255) model time 0.0000 (0.0000) loss 6.5669 (6.4134) grad_norm 1.9116 (3.0513) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:01:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][30/625] eta 0:06:13 lr 0.000013 wd 0.0500 time 0.6004 (0.6274) data time 0.0009 (0.0178) model time 0.0000 (0.0000) loss 6.2323 (6.3985) grad_norm 4.5865 (3.1586) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:01:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][40/625] eta 0:06:03 lr 0.000013 wd 0.0500 time 0.5978 (0.6209) data time 0.0008 (0.0137) model time 0.0000 (0.0000) loss 6.6947 (6.3977) grad_norm 3.6945 (3.2365) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:01:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][50/625] eta 0:05:54 lr 0.000013 wd 0.0500 time 0.6105 (0.6171) data time 0.0008 (0.0112) model time 0.0000 (0.0000) loss 6.5135 (6.3694) grad_norm 2.4149 (3.1860) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:01:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][60/625] eta 0:05:48 lr 0.000013 wd 0.0500 time 0.6085 (0.6161) data time 0.0008 (0.0096) model time 0.6077 (0.6098) loss 6.5522 (6.3405) grad_norm 2.3149 (3.1724) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:01:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][70/625] eta 0:05:41 lr 0.000013 wd 0.0500 time 0.6066 (0.6148) data time 0.0011 (0.0084) model time 0.6055 (0.6079) loss 6.8689 (6.4162) grad_norm 2.3541 (3.1462) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:01:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][80/625] eta 0:05:34 lr 0.000013 wd 0.0500 time 0.6067 (0.6137) data time 0.0009 (0.0074) model time 0.6059 (0.6069) loss 6.1644 (6.4432) grad_norm 2.2347 (3.1182) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:02:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][90/625] eta 0:05:27 lr 0.000013 wd 0.0500 time 0.6078 (0.6128) data time 0.0010 (0.0067) model time 0.6068 (0.6064) loss 6.6032 (6.4240) grad_norm 2.1829 (3.0401) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:02:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][100/625] eta 0:05:21 lr 0.000013 wd 0.0500 time 0.6009 (0.6116) data time 0.0007 (0.0062) model time 0.6001 (0.6051) loss 5.7518 (6.3915) grad_norm 2.2464 (2.9754) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:02:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][110/625] eta 0:05:14 lr 0.000013 wd 0.0500 time 0.6030 (0.6108) data time 0.0008 (0.0057) model time 0.6022 (0.6044) loss 5.5912 (6.3897) grad_norm 2.3114 (3.0275) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:02:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][120/625] eta 0:05:09 lr 0.000013 wd 0.0500 time 0.6015 (0.6120) data time 0.0010 (0.0053) model time 0.6005 (0.6072) loss 6.9490 (6.4325) grad_norm 2.1942 (3.0313) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:02:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][130/625] eta 0:05:02 lr 0.000013 wd 0.0500 time 0.6039 (0.6112) data time 0.0008 (0.0050) model time 0.6031 (0.6065) loss 6.3015 (6.4312) grad_norm 1.7767 (2.9826) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:02:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][140/625] eta 0:04:56 lr 0.000013 wd 0.0500 time 0.6205 (0.6112) data time 0.0011 (0.0047) model time 0.6194 (0.6068) loss 5.8769 (6.4239) grad_norm 2.5045 (3.0454) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:02:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][150/625] eta 0:04:50 lr 0.000013 wd 0.0500 time 0.6112 (0.6111) data time 0.0008 (0.0045) model time 0.6104 (0.6071) loss 7.9064 (6.4355) grad_norm 2.0854 (3.0296) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:02:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][160/625] eta 0:04:44 lr 0.000013 wd 0.0500 time 0.6152 (0.6111) data time 0.0012 (0.0043) model time 0.6140 (0.6073) loss 6.4862 (6.4625) grad_norm 3.0858 (2.9948) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:02:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][170/625] eta 0:04:37 lr 0.000013 wd 0.0500 time 0.5992 (0.6106) data time 0.0008 (0.0041) model time 0.5984 (0.6069) loss 5.8050 (6.4695) grad_norm 1.9484 (2.9612) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:02:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][180/625] eta 0:04:31 lr 0.000013 wd 0.0500 time 0.6025 (0.6101) data time 0.0010 (0.0039) model time 0.6015 (0.6063) loss 6.0165 (6.4654) grad_norm 2.2428 (2.9485) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:03:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][190/625] eta 0:04:25 lr 0.000013 wd 0.0500 time 0.6099 (0.6096) data time 0.0008 (0.0038) model time 0.6091 (0.6059) loss 5.8181 (6.4715) grad_norm 2.3364 (2.9359) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:03:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][200/625] eta 0:04:18 lr 0.000013 wd 0.0500 time 0.6008 (0.6090) data time 0.0010 (0.0036) model time 0.5998 (0.6052) loss 5.5618 (6.4677) grad_norm 2.3868 (2.9202) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:03:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][210/625] eta 0:04:12 lr 0.000013 wd 0.0500 time 0.6269 (0.6089) data time 0.0008 (0.0035) model time 0.6261 (0.6052) loss 5.8633 (6.4783) grad_norm 3.8430 (2.9225) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:03:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][220/625] eta 0:04:06 lr 0.000013 wd 0.0500 time 0.6097 (0.6088) data time 0.0011 (0.0034) model time 0.6087 (0.6053) loss 6.5976 (6.4748) grad_norm 1.8681 (2.9104) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:03:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][230/625] eta 0:04:00 lr 0.000013 wd 0.0500 time 0.6092 (0.6088) data time 0.0008 (0.0033) model time 0.6084 (0.6055) loss 6.6597 (6.4704) grad_norm 3.1489 (2.8962) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:03:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][240/625] eta 0:03:54 lr 0.000013 wd 0.0500 time 0.6031 (0.6087) data time 0.0008 (0.0032) model time 0.6023 (0.6055) loss 6.3003 (6.4683) grad_norm 2.6466 (2.8834) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:03:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][250/625] eta 0:03:48 lr 0.000013 wd 0.0500 time 0.6015 (0.6084) data time 0.0011 (0.0031) model time 0.6004 (0.6051) loss 6.2953 (6.4716) grad_norm 2.5526 (2.8713) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:03:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][260/625] eta 0:03:41 lr 0.000013 wd 0.0500 time 0.6011 (0.6081) data time 0.0008 (0.0030) model time 0.6003 (0.6048) loss 5.4246 (6.4675) grad_norm 1.7803 (2.8721) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:03:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][270/625] eta 0:03:35 lr 0.000013 wd 0.0500 time 0.6239 (0.6079) data time 0.0011 (0.0029) model time 0.6228 (0.6048) loss 7.1920 (6.4756) grad_norm 1.9991 (2.8657) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:03:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][280/625] eta 0:03:29 lr 0.000013 wd 0.0500 time 0.6078 (0.6083) data time 0.0011 (0.0029) model time 0.6067 (0.6053) loss 7.6526 (6.4836) grad_norm 2.6041 (2.8452) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:04:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][290/625] eta 0:03:23 lr 0.000013 wd 0.0500 time 0.6106 (0.6083) data time 0.0010 (0.0028) model time 0.6096 (0.6054) loss 5.4410 (6.4726) grad_norm 2.6159 (2.8281) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:04:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][300/625] eta 0:03:17 lr 0.000013 wd 0.0500 time 0.6106 (0.6083) data time 0.0010 (0.0028) model time 0.6096 (0.6055) loss 5.6015 (6.4655) grad_norm 2.2748 (2.8307) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:04:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][310/625] eta 0:03:11 lr 0.000013 wd 0.0500 time 0.6108 (0.6084) data time 0.0011 (0.0027) model time 0.6097 (0.6057) loss 6.1999 (6.4597) grad_norm 3.0908 (2.8239) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:04:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][320/625] eta 0:03:05 lr 0.000013 wd 0.0500 time 0.6014 (0.6083) data time 0.0008 (0.0027) model time 0.6006 (0.6056) loss 6.7117 (6.4523) grad_norm 3.4143 (2.8400) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:04:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][330/625] eta 0:02:59 lr 0.000013 wd 0.0500 time 0.5987 (0.6082) data time 0.0011 (0.0026) model time 0.5976 (0.6055) loss 5.8558 (6.4590) grad_norm 2.3554 (2.8701) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:04:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][340/625] eta 0:02:53 lr 0.000013 wd 0.0500 time 0.6038 (0.6086) data time 0.0008 (0.0026) model time 0.6030 (0.6060) loss 6.2330 (6.4584) grad_norm 2.7360 (2.8678) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:04:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][350/625] eta 0:02:47 lr 0.000013 wd 0.0500 time 0.6066 (0.6085) data time 0.0010 (0.0025) model time 0.6056 (0.6059) loss 6.4144 (6.4599) grad_norm 2.2434 (2.8539) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:04:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][360/625] eta 0:02:41 lr 0.000013 wd 0.0500 time 0.6067 (0.6085) data time 0.0010 (0.0025) model time 0.6056 (0.6061) loss 6.2409 (6.4706) grad_norm 1.7554 (2.8541) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:04:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][370/625] eta 0:02:35 lr 0.000013 wd 0.0500 time 0.6475 (0.6088) data time 0.0011 (0.0025) model time 0.6464 (0.6064) loss 6.5285 (6.4631) grad_norm 2.1553 (2.8412) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:04:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][380/625] eta 0:02:29 lr 0.000013 wd 0.0500 time 0.6050 (0.6091) data time 0.0008 (0.0024) model time 0.6041 (0.6068) loss 5.4625 (6.4543) grad_norm 3.0929 (2.8333) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:05:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][390/625] eta 0:02:23 lr 0.000013 wd 0.0500 time 0.7158 (0.6093) data time 0.0008 (0.0024) model time 0.7150 (0.6070) loss 6.6479 (6.4553) grad_norm 2.8859 (2.8351) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:05:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][400/625] eta 0:02:17 lr 0.000013 wd 0.0500 time 0.6035 (0.6091) data time 0.0008 (0.0024) model time 0.6027 (0.6068) loss 5.9609 (6.4665) grad_norm 2.1718 (2.8287) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:05:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][410/625] eta 0:02:10 lr 0.000013 wd 0.0500 time 0.5999 (0.6089) data time 0.0011 (0.0024) model time 0.5988 (0.6066) loss 7.4219 (6.4754) grad_norm 2.4266 (2.8272) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:05:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][420/625] eta 0:02:04 lr 0.000013 wd 0.0500 time 0.6022 (0.6087) data time 0.0008 (0.0024) model time 0.6014 (0.6064) loss 8.0397 (6.4777) grad_norm 2.8600 (2.8282) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:05:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][430/625] eta 0:01:58 lr 0.000013 wd 0.0500 time 0.6089 (0.6087) data time 0.0008 (0.0023) model time 0.6081 (0.6064) loss 7.2125 (6.4813) grad_norm 2.1076 (2.8217) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:05:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][440/625] eta 0:01:52 lr 0.000013 wd 0.0500 time 0.6041 (0.6086) data time 0.0010 (0.0023) model time 0.6031 (0.6064) loss 6.9892 (6.4833) grad_norm 2.4826 (2.8130) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:05:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][450/625] eta 0:01:46 lr 0.000013 wd 0.0500 time 0.6069 (0.6086) data time 0.0010 (0.0023) model time 0.6058 (0.6064) loss 7.1371 (6.4833) grad_norm 2.2846 (2.8161) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:05:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][460/625] eta 0:01:40 lr 0.000013 wd 0.0500 time 0.5996 (0.6086) data time 0.0011 (0.0023) model time 0.5985 (0.6064) loss 6.7039 (6.4841) grad_norm 2.3558 (2.8550) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:05:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][470/625] eta 0:01:34 lr 0.000013 wd 0.0500 time 0.6019 (0.6085) data time 0.0008 (0.0022) model time 0.6011 (0.6063) loss 6.2387 (6.4872) grad_norm 3.5416 (2.8549) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:05:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][480/625] eta 0:01:28 lr 0.000013 wd 0.0500 time 0.5963 (0.6084) data time 0.0008 (0.0022) model time 0.5955 (0.6062) loss 6.0224 (6.4862) grad_norm 2.7562 (2.8713) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:06:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][490/625] eta 0:01:22 lr 0.000013 wd 0.0500 time 0.5951 (0.6082) data time 0.0008 (0.0022) model time 0.5943 (0.6060) loss 6.6623 (6.4809) grad_norm 2.4466 (2.8741) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:06:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][500/625] eta 0:01:16 lr 0.000013 wd 0.0500 time 0.6219 (0.6083) data time 0.0010 (0.0022) model time 0.6208 (0.6062) loss 5.6606 (6.4796) grad_norm 2.5733 (2.8712) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:06:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][510/625] eta 0:01:09 lr 0.000013 wd 0.0500 time 0.6079 (0.6084) data time 0.0010 (0.0022) model time 0.6069 (0.6063) loss 6.3286 (6.4752) grad_norm 2.8838 (2.8668) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:06:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][520/625] eta 0:01:03 lr 0.000013 wd 0.0500 time 0.6106 (0.6085) data time 0.0008 (0.0021) model time 0.6097 (0.6064) loss 5.9570 (6.4786) grad_norm 2.9941 (2.8615) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:06:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][530/625] eta 0:00:57 lr 0.000013 wd 0.0500 time 0.6027 (0.6086) data time 0.0010 (0.0021) model time 0.6017 (0.6065) loss 7.7378 (6.4798) grad_norm 2.0141 (2.8596) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:06:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][540/625] eta 0:00:51 lr 0.000013 wd 0.0500 time 0.6291 (0.6086) data time 0.0010 (0.0021) model time 0.6281 (0.6065) loss 6.1930 (6.4761) grad_norm 2.5522 (2.8600) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:06:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][550/625] eta 0:00:45 lr 0.000013 wd 0.0500 time 0.6086 (0.6085) data time 0.0008 (0.0021) model time 0.6078 (0.6065) loss 5.5138 (6.4766) grad_norm 3.5870 (2.8551) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:06:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][560/625] eta 0:00:39 lr 0.000013 wd 0.0500 time 0.5997 (0.6087) data time 0.0010 (0.0021) model time 0.5987 (0.6067) loss 6.3934 (6.4707) grad_norm 3.7094 (2.8528) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:06:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][570/625] eta 0:00:33 lr 0.000013 wd 0.0500 time 0.6097 (0.6087) data time 0.0008 (0.0021) model time 0.6089 (0.6068) loss 5.8913 (6.4694) grad_norm 3.3870 (2.8815) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:06:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][580/625] eta 0:00:27 lr 0.000013 wd 0.0500 time 0.6086 (0.6088) data time 0.0008 (0.0020) model time 0.6079 (0.6068) loss 7.1309 (6.4819) grad_norm 2.4236 (2.8742) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:07:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][590/625] eta 0:00:21 lr 0.000013 wd 0.0500 time 0.6092 (0.6088) data time 0.0008 (0.0020) model time 0.6084 (0.6069) loss 7.2937 (6.4848) grad_norm 2.6458 (2.8915) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:07:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][600/625] eta 0:00:15 lr 0.000013 wd 0.0500 time 0.6092 (0.6089) data time 0.0010 (0.0020) model time 0.6081 (0.6070) loss 6.4026 (6.4799) grad_norm 5.7589 (2.9127) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:07:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][610/625] eta 0:00:09 lr 0.000013 wd 0.0500 time 0.6063 (0.6089) data time 0.0008 (0.0020) model time 0.6055 (0.6070) loss 5.6471 (6.4820) grad_norm 2.8869 (2.9117) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:07:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [294/300][620/625] eta 0:00:03 lr 0.000013 wd 0.0500 time 0.6027 (0.6089) data time 0.0008 (0.0020) model time 0.6020 (0.6070) loss 5.5795 (6.4843) grad_norm 2.8790 (2.9112) loss_scale 512.0000 (512.0000) mem 22338MB +[2024-07-29 11:07:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 294 training takes 0:06:20 +[2024-07-29 11:07:25 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 11:07:27 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 11:07:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.521 (0.521) Loss 0.4868 (0.4868) Acc@1 90.625 (90.625) Acc@5 98.975 (98.975) Mem 22338MB +[2024-07-29 11:07:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.163) Loss 0.7432 (0.5910) Acc@1 83.057 (88.281) Acc@5 97.070 (98.158) Mem 22338MB +[2024-07-29 11:07:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.127 (0.146) Loss 0.8062 (0.6753) Acc@1 81.543 (85.749) Acc@5 96.240 (97.407) Mem 22338MB +[2024-07-29 11:07:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.387 Acc@5 97.395 +[2024-07-29 11:07:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.4% +[2024-07-29 11:07:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.796 (0.796) Loss 0.4890 (0.4890) Acc@1 90.430 (90.430) Acc@5 99.023 (99.023) Mem 22338MB +[2024-07-29 11:07:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.126 (0.188) Loss 0.7339 (0.5914) Acc@1 83.105 (88.277) Acc@5 97.266 (98.167) Mem 22338MB +[2024-07-29 11:07:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.126 (0.159) Loss 0.8096 (0.6757) Acc@1 81.348 (85.698) Acc@5 96.338 (97.428) Mem 22338MB +[2024-07-29 11:07:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.325 Acc@5 97.419 +[2024-07-29 11:07:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 11:07:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.32% +[2024-07-29 11:07:34 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 11:07:39 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 11:07:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][0/625] eta 0:10:01 lr 0.000013 wd 0.0500 time 0.9618 (0.9618) data time 0.4259 (0.4259) model time 0.0000 (0.0000) loss 6.0637 (6.0637) grad_norm 3.0152 (3.0152) loss_scale 1024.0000 (1024.0000) mem 22338MB +[2024-07-29 11:07:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][10/625] eta 0:06:30 lr 0.000013 wd 0.0500 time 0.6061 (0.6356) data time 0.0010 (0.0397) model time 0.0000 (0.0000) loss 6.9446 (6.4569) grad_norm 2.6469 (2.5988) loss_scale 1024.0000 (1024.0000) mem 22338MB +[2024-07-29 11:07:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][20/625] eta 0:06:15 lr 0.000013 wd 0.0500 time 0.6044 (0.6213) data time 0.0008 (0.0213) model time 0.0000 (0.0000) loss 5.5978 (6.3757) grad_norm 3.0542 (2.7783) loss_scale 1024.0000 (1024.0000) mem 22338MB +[2024-07-29 11:07:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][30/625] eta 0:06:07 lr 0.000013 wd 0.0500 time 0.6052 (0.6176) data time 0.0012 (0.0147) model time 0.0000 (0.0000) loss 7.2638 (6.3628) grad_norm 2.3401 (2.8050) loss_scale 1024.0000 (1024.0000) mem 22338MB +[2024-07-29 11:08:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][40/625] eta 0:05:59 lr 0.000013 wd 0.0500 time 0.6043 (0.6148) data time 0.0008 (0.0114) model time 0.0000 (0.0000) loss 6.2541 (6.4485) grad_norm 2.5129 (2.9906) loss_scale 1024.0000 (1024.0000) mem 22338MB +[2024-07-29 11:08:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][50/625] eta 0:05:52 lr 0.000013 wd 0.0500 time 0.5835 (0.6128) data time 0.0008 (0.0094) model time 0.0000 (0.0000) loss 5.9359 (6.3684) grad_norm 2.9313 (2.9370) loss_scale 1024.0000 (1024.0000) mem 22338MB +[2024-07-29 11:08:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][60/625] eta 0:05:45 lr 0.000013 wd 0.0500 time 0.6018 (0.6112) data time 0.0010 (0.0080) model time 0.6007 (0.6024) loss 6.0299 (6.3684) grad_norm 2.5778 (2.8811) loss_scale 1024.0000 (1024.0000) mem 22338MB +[2024-07-29 11:08:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][70/625] eta 0:05:38 lr 0.000013 wd 0.0500 time 0.6004 (0.6097) data time 0.0007 (0.0070) model time 0.5997 (0.6010) loss 5.9471 (6.3627) grad_norm 2.5935 (2.8255) loss_scale 1024.0000 (1024.0000) mem 22338MB +[2024-07-29 11:08:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][80/625] eta 0:05:31 lr 0.000013 wd 0.0500 time 0.6062 (0.6086) data time 0.0011 (0.0063) model time 0.6052 (0.6006) loss 6.1972 (6.3863) grad_norm 2.5411 (2.7855) loss_scale 1024.0000 (1024.0000) mem 22338MB +[2024-07-29 11:08:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][90/625] eta 0:05:25 lr 0.000013 wd 0.0500 time 0.6061 (0.6084) data time 0.0008 (0.0057) model time 0.6053 (0.6019) loss 5.6400 (6.4074) grad_norm 2.1513 (2.7501) loss_scale 1024.0000 (1024.0000) mem 22338MB +[2024-07-29 11:08:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][100/625] eta 0:05:19 lr 0.000013 wd 0.0500 time 0.6045 (0.6082) data time 0.0011 (0.0052) model time 0.6034 (0.6025) loss 5.4030 (6.3740) grad_norm 2.5235 (inf) loss_scale 512.0000 (993.5842) mem 22338MB +[2024-07-29 11:08:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][110/625] eta 0:05:13 lr 0.000013 wd 0.0500 time 0.6044 (0.6080) data time 0.0008 (0.0048) model time 0.6035 (0.6029) loss 7.0265 (6.3843) grad_norm 2.7289 (inf) loss_scale 512.0000 (950.1982) mem 22338MB +[2024-07-29 11:08:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][120/625] eta 0:05:06 lr 0.000013 wd 0.0500 time 0.5995 (0.6077) data time 0.0008 (0.0045) model time 0.5987 (0.6029) loss 6.9414 (6.4002) grad_norm 2.7992 (inf) loss_scale 512.0000 (913.9835) mem 22338MB +[2024-07-29 11:08:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][130/625] eta 0:05:00 lr 0.000013 wd 0.0500 time 0.6037 (0.6072) data time 0.0008 (0.0043) model time 0.6029 (0.6027) loss 5.8472 (6.3914) grad_norm 5.8401 (inf) loss_scale 512.0000 (883.2977) mem 22338MB +[2024-07-29 11:09:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][140/625] eta 0:04:54 lr 0.000013 wd 0.0500 time 0.5946 (0.6068) data time 0.0011 (0.0040) model time 0.5935 (0.6024) loss 7.3826 (6.4199) grad_norm 2.7070 (inf) loss_scale 512.0000 (856.9645) mem 22338MB +[2024-07-29 11:09:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][150/625] eta 0:04:47 lr 0.000013 wd 0.0500 time 0.5981 (0.6063) data time 0.0007 (0.0039) model time 0.5973 (0.6019) loss 6.7888 (6.4476) grad_norm 2.7000 (inf) loss_scale 512.0000 (834.1192) mem 22338MB +[2024-07-29 11:09:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][160/625] eta 0:04:42 lr 0.000013 wd 0.0500 time 0.6106 (0.6075) data time 0.0011 (0.0037) model time 0.6095 (0.6040) loss 5.7668 (6.4307) grad_norm 3.0196 (inf) loss_scale 512.0000 (814.1118) mem 22338MB +[2024-07-29 11:09:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][170/625] eta 0:04:36 lr 0.000013 wd 0.0500 time 0.6078 (0.6075) data time 0.0011 (0.0035) model time 0.6067 (0.6043) loss 6.9183 (6.4117) grad_norm 2.2958 (inf) loss_scale 512.0000 (796.4444) mem 22338MB +[2024-07-29 11:09:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][180/625] eta 0:04:30 lr 0.000013 wd 0.0500 time 0.6056 (0.6075) data time 0.0011 (0.0034) model time 0.6045 (0.6044) loss 6.7119 (6.4137) grad_norm 2.4561 (inf) loss_scale 512.0000 (780.7293) mem 22338MB +[2024-07-29 11:09:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][190/625] eta 0:04:24 lr 0.000013 wd 0.0500 time 0.6003 (0.6074) data time 0.0008 (0.0033) model time 0.5995 (0.6044) loss 6.9229 (6.4364) grad_norm 2.0884 (inf) loss_scale 512.0000 (766.6597) mem 22338MB +[2024-07-29 11:09:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][200/625] eta 0:04:18 lr 0.000013 wd 0.0500 time 0.6027 (0.6072) data time 0.0008 (0.0032) model time 0.6019 (0.6042) loss 7.6438 (6.4464) grad_norm 4.5615 (inf) loss_scale 512.0000 (753.9900) mem 22338MB +[2024-07-29 11:09:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][210/625] eta 0:04:11 lr 0.000013 wd 0.0500 time 0.6010 (0.6069) data time 0.0010 (0.0031) model time 0.6000 (0.6040) loss 7.3205 (6.4470) grad_norm 27.7721 (inf) loss_scale 512.0000 (742.5213) mem 22338MB +[2024-07-29 11:09:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][220/625] eta 0:04:05 lr 0.000013 wd 0.0500 time 0.5973 (0.6066) data time 0.0010 (0.0030) model time 0.5963 (0.6037) loss 6.7928 (6.4491) grad_norm 2.1350 (inf) loss_scale 512.0000 (732.0905) mem 22338MB +[2024-07-29 11:09:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][230/625] eta 0:03:59 lr 0.000013 wd 0.0500 time 0.6083 (0.6066) data time 0.0009 (0.0029) model time 0.6074 (0.6038) loss 6.5428 (6.4655) grad_norm 4.8179 (inf) loss_scale 512.0000 (722.5628) mem 22338MB +[2024-07-29 11:10:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][240/625] eta 0:03:53 lr 0.000013 wd 0.0500 time 0.6037 (0.6066) data time 0.0011 (0.0028) model time 0.6026 (0.6039) loss 6.8352 (6.4636) grad_norm 2.6021 (inf) loss_scale 512.0000 (713.8257) mem 22338MB +[2024-07-29 11:10:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][250/625] eta 0:03:47 lr 0.000013 wd 0.0500 time 0.6015 (0.6072) data time 0.0008 (0.0027) model time 0.6007 (0.6048) loss 6.3911 (6.4503) grad_norm 4.9833 (inf) loss_scale 256.0000 (699.6653) mem 22338MB +[2024-07-29 11:10:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][260/625] eta 0:03:41 lr 0.000013 wd 0.0500 time 0.6022 (0.6072) data time 0.0009 (0.0027) model time 0.6013 (0.6048) loss 6.7396 (6.4606) grad_norm 2.4848 (inf) loss_scale 256.0000 (682.6667) mem 22338MB +[2024-07-29 11:10:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][270/625] eta 0:03:35 lr 0.000013 wd 0.0500 time 0.5968 (0.6070) data time 0.0011 (0.0026) model time 0.5957 (0.6046) loss 6.9787 (6.4640) grad_norm 2.8666 (inf) loss_scale 256.0000 (666.9225) mem 22338MB +[2024-07-29 11:10:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][280/625] eta 0:03:29 lr 0.000013 wd 0.0500 time 0.5915 (0.6068) data time 0.0008 (0.0026) model time 0.5907 (0.6045) loss 6.5679 (6.4641) grad_norm 5.2659 (inf) loss_scale 256.0000 (652.2989) mem 22338MB +[2024-07-29 11:10:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][290/625] eta 0:03:23 lr 0.000013 wd 0.0500 time 0.5992 (0.6066) data time 0.0011 (0.0025) model time 0.5980 (0.6043) loss 5.9440 (6.4638) grad_norm 4.6717 (inf) loss_scale 256.0000 (638.6804) mem 22338MB +[2024-07-29 11:10:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][300/625] eta 0:03:17 lr 0.000013 wd 0.0500 time 0.6059 (0.6065) data time 0.0010 (0.0025) model time 0.6048 (0.6042) loss 6.8118 (6.4741) grad_norm 2.0334 (inf) loss_scale 256.0000 (625.9668) mem 22338MB +[2024-07-29 11:10:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][310/625] eta 0:03:11 lr 0.000013 wd 0.0500 time 0.6145 (0.6065) data time 0.0008 (0.0024) model time 0.6136 (0.6042) loss 6.7543 (6.4705) grad_norm 3.2267 (inf) loss_scale 256.0000 (614.0707) mem 22338MB +[2024-07-29 11:10:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][320/625] eta 0:03:04 lr 0.000013 wd 0.0500 time 0.6071 (0.6065) data time 0.0011 (0.0024) model time 0.6060 (0.6043) loss 6.4090 (6.4742) grad_norm 3.1586 (inf) loss_scale 256.0000 (602.9159) mem 22338MB +[2024-07-29 11:11:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][330/625] eta 0:02:58 lr 0.000013 wd 0.0500 time 0.6067 (0.6066) data time 0.0011 (0.0023) model time 0.6056 (0.6044) loss 7.1069 (6.4854) grad_norm 49.3402 (inf) loss_scale 256.0000 (592.4350) mem 22338MB +[2024-07-29 11:11:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][340/625] eta 0:02:52 lr 0.000013 wd 0.0500 time 0.5992 (0.6065) data time 0.0009 (0.0023) model time 0.5983 (0.6043) loss 6.0593 (6.4892) grad_norm 3.2561 (inf) loss_scale 256.0000 (582.5689) mem 22338MB +[2024-07-29 11:11:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][350/625] eta 0:02:46 lr 0.000013 wd 0.0500 time 0.5999 (0.6064) data time 0.0008 (0.0023) model time 0.5991 (0.6042) loss 5.6599 (6.4959) grad_norm 3.5589 (inf) loss_scale 256.0000 (573.2650) mem 22338MB +[2024-07-29 11:11:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][360/625] eta 0:02:40 lr 0.000013 wd 0.0500 time 0.5997 (0.6062) data time 0.0011 (0.0022) model time 0.5985 (0.6041) loss 6.6218 (6.4934) grad_norm 1.9135 (inf) loss_scale 256.0000 (564.4765) mem 22338MB +[2024-07-29 11:11:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][370/625] eta 0:02:34 lr 0.000013 wd 0.0500 time 0.5990 (0.6061) data time 0.0011 (0.0022) model time 0.5979 (0.6040) loss 8.1012 (6.4977) grad_norm 3.7670 (inf) loss_scale 256.0000 (556.1617) mem 22338MB +[2024-07-29 11:11:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][380/625] eta 0:02:28 lr 0.000013 wd 0.0500 time 0.6079 (0.6066) data time 0.0009 (0.0022) model time 0.6070 (0.6046) loss 5.6770 (6.4971) grad_norm 3.0720 (inf) loss_scale 256.0000 (548.2835) mem 22338MB +[2024-07-29 11:11:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][390/625] eta 0:02:22 lr 0.000013 wd 0.0500 time 0.6089 (0.6067) data time 0.0008 (0.0021) model time 0.6081 (0.6047) loss 5.7184 (6.4947) grad_norm 2.1283 (inf) loss_scale 256.0000 (540.8082) mem 22338MB +[2024-07-29 11:11:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][400/625] eta 0:02:16 lr 0.000013 wd 0.0500 time 0.6065 (0.6067) data time 0.0008 (0.0021) model time 0.6057 (0.6048) loss 7.2251 (6.5057) grad_norm 2.4778 (inf) loss_scale 256.0000 (533.7057) mem 22338MB +[2024-07-29 11:11:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][410/625] eta 0:02:10 lr 0.000013 wd 0.0500 time 0.5982 (0.6067) data time 0.0010 (0.0021) model time 0.5971 (0.6047) loss 5.9000 (6.5099) grad_norm 3.3044 (inf) loss_scale 256.0000 (526.9489) mem 22338MB +[2024-07-29 11:11:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][420/625] eta 0:02:04 lr 0.000013 wd 0.0500 time 0.5969 (0.6066) data time 0.0011 (0.0021) model time 0.5958 (0.6047) loss 5.1235 (6.5017) grad_norm 2.1669 (inf) loss_scale 256.0000 (520.5131) mem 22338MB +[2024-07-29 11:12:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][430/625] eta 0:01:58 lr 0.000013 wd 0.0500 time 0.5970 (0.6065) data time 0.0010 (0.0020) model time 0.5960 (0.6046) loss 6.9022 (6.5007) grad_norm 2.1244 (inf) loss_scale 256.0000 (514.3759) mem 22338MB +[2024-07-29 11:12:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][440/625] eta 0:01:52 lr 0.000013 wd 0.0500 time 0.5967 (0.6063) data time 0.0010 (0.0020) model time 0.5957 (0.6044) loss 8.5473 (6.5046) grad_norm 2.0820 (inf) loss_scale 256.0000 (508.5170) mem 22338MB +[2024-07-29 11:12:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][450/625] eta 0:01:46 lr 0.000013 wd 0.0500 time 0.6069 (0.6063) data time 0.0008 (0.0020) model time 0.6061 (0.6044) loss 5.7944 (6.5074) grad_norm 3.2520 (inf) loss_scale 256.0000 (502.9180) mem 22338MB +[2024-07-29 11:12:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][460/625] eta 0:01:40 lr 0.000013 wd 0.0500 time 0.6088 (0.6064) data time 0.0008 (0.0020) model time 0.6081 (0.6045) loss 6.7559 (6.5068) grad_norm 2.9161 (inf) loss_scale 256.0000 (497.5618) mem 22338MB +[2024-07-29 11:12:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][470/625] eta 0:01:34 lr 0.000013 wd 0.0500 time 0.7231 (0.6067) data time 0.0011 (0.0019) model time 0.7220 (0.6049) loss 6.4202 (6.5024) grad_norm 2.6586 (inf) loss_scale 256.0000 (492.4331) mem 22338MB +[2024-07-29 11:12:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][480/625] eta 0:01:27 lr 0.000013 wd 0.0500 time 0.6081 (0.6067) data time 0.0008 (0.0019) model time 0.6073 (0.6050) loss 6.6706 (6.5061) grad_norm 2.2343 (inf) loss_scale 256.0000 (487.5177) mem 22338MB +[2024-07-29 11:12:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][490/625] eta 0:01:21 lr 0.000013 wd 0.0500 time 0.5976 (0.6066) data time 0.0011 (0.0019) model time 0.5966 (0.6048) loss 5.6750 (6.5061) grad_norm 2.9419 (inf) loss_scale 256.0000 (482.8024) mem 22338MB +[2024-07-29 11:12:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][500/625] eta 0:01:15 lr 0.000013 wd 0.0500 time 0.6020 (0.6065) data time 0.0011 (0.0019) model time 0.6009 (0.6047) loss 7.3625 (6.5118) grad_norm 2.3962 (inf) loss_scale 256.0000 (478.2754) mem 22338MB +[2024-07-29 11:12:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][510/625] eta 0:01:09 lr 0.000013 wd 0.0500 time 0.6044 (0.6064) data time 0.0011 (0.0019) model time 0.6033 (0.6047) loss 6.4066 (6.5120) grad_norm 2.2578 (inf) loss_scale 256.0000 (473.9256) mem 22338MB +[2024-07-29 11:12:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][520/625] eta 0:01:03 lr 0.000013 wd 0.0500 time 0.6078 (0.6063) data time 0.0009 (0.0019) model time 0.6070 (0.6046) loss 5.6042 (6.5119) grad_norm 2.3424 (inf) loss_scale 256.0000 (469.7428) mem 22338MB +[2024-07-29 11:13:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][530/625] eta 0:00:57 lr 0.000013 wd 0.0500 time 0.6081 (0.6064) data time 0.0011 (0.0018) model time 0.6070 (0.6046) loss 6.7568 (6.5077) grad_norm 2.8149 (inf) loss_scale 256.0000 (465.7175) mem 22338MB +[2024-07-29 11:13:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][540/625] eta 0:00:51 lr 0.000013 wd 0.0500 time 0.6095 (0.6064) data time 0.0009 (0.0018) model time 0.6086 (0.6047) loss 6.4741 (6.4996) grad_norm 2.8335 (inf) loss_scale 256.0000 (461.8410) mem 22338MB +[2024-07-29 11:13:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][550/625] eta 0:00:45 lr 0.000013 wd 0.0500 time 0.5916 (0.6064) data time 0.0009 (0.0018) model time 0.5908 (0.6047) loss 5.7563 (6.5012) grad_norm 3.6674 (inf) loss_scale 256.0000 (458.1053) mem 22338MB +[2024-07-29 11:13:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 11:13:14 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 11:13:19 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 11:14:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 11:14:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 11:15:20 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 11:15:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 11:15:30 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 11:15:31 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 11:15:31 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 11:15:31 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 295) +[2024-07-29 11:15:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 11:15:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][560/625] eta 0:01:45 lr 0.000013 wd 0.0500 time 0.5738 (1.6296) data time 0.0006 (0.0858) model time 0.5732 (1.5438) loss 6.7658 (6.6770) grad_norm 3.0389 (2.4958) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:15:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 11:15:51 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 11:15:57 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 11:17:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 11:17:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 11:18:32 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 11:18:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 11:18:51 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 11:18:52 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 11:18:52 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 11:18:52 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 295) +[2024-07-29 11:18:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 11:19:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][570/625] eta 0:02:04 lr 0.000013 wd 0.0500 time 0.5166 (2.2574) data time 0.0018 (0.1174) model time 0.5148 (2.1400) loss 6.5170 (6.8970) grad_norm 2.3351 (2.8691) loss_scale 256.0000 (256.0000) mem 22346MB +[2024-07-29 11:19:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][580/625] eta 0:00:55 lr 0.000013 wd 0.0500 time 0.5161 (1.2355) data time 0.0011 (0.0490) model time 0.5151 (1.1866) loss 6.6050 (6.7999) grad_norm 2.8093 (2.7162) loss_scale 256.0000 (256.0000) mem 22346MB +[2024-07-29 11:19:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][590/625] eta 0:00:33 lr 0.000013 wd 0.0500 time 0.5166 (0.9705) data time 0.0008 (0.0316) model time 0.5158 (0.9390) loss 7.1667 (6.8364) grad_norm 3.3831 (2.8911) loss_scale 256.0000 (256.0000) mem 22346MB +[2024-07-29 11:19:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][600/625] eta 0:00:21 lr 0.000013 wd 0.0500 time 0.5175 (0.8486) data time 0.0011 (0.0233) model time 0.5165 (0.8253) loss 5.8170 (6.8219) grad_norm 1.8849 (2.9420) loss_scale 256.0000 (256.0000) mem 22346MB +[2024-07-29 11:19:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][610/625] eta 0:00:11 lr 0.000013 wd 0.0500 time 0.5328 (0.7787) data time 0.0006 (0.0186) model time 0.5322 (0.7601) loss 6.3308 (6.7728) grad_norm 3.6899 (3.7746) loss_scale 256.0000 (256.0000) mem 22346MB +[2024-07-29 11:19:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [295/300][620/625] eta 0:00:03 lr 0.000013 wd 0.0500 time 0.5146 (0.7408) data time 0.0008 (0.0156) model time 0.5138 (0.7252) loss 7.3338 (6.7393) grad_norm 3.8828 (3.8471) loss_scale 256.0000 (256.0000) mem 22346MB +[2024-07-29 11:19:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 295 training takes 0:00:44 +[2024-07-29 11:19:42 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 11:19:45 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 11:19:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.589 (0.589) Loss 0.4878 (0.4878) Acc@1 90.527 (90.527) Acc@5 98.975 (98.975) Mem 22346MB +[2024-07-29 11:19:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.161) Loss 0.7393 (0.5898) Acc@1 83.008 (88.290) Acc@5 97.266 (98.176) Mem 22346MB +[2024-07-29 11:19:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.140) Loss 0.8096 (0.6751) Acc@1 81.348 (85.742) Acc@5 96.436 (97.445) Mem 22346MB +[2024-07-29 11:19:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.385 Acc@5 97.419 +[2024-07-29 11:19:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.4% +[2024-07-29 11:19:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 1.003 (1.003) Loss 0.4885 (0.4885) Acc@1 90.479 (90.479) Acc@5 99.023 (99.023) Mem 22346MB +[2024-07-29 11:19:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.200) Loss 0.7339 (0.5911) Acc@1 83.105 (88.286) Acc@5 97.217 (98.167) Mem 22346MB +[2024-07-29 11:19:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.160) Loss 0.8091 (0.6754) Acc@1 81.348 (85.707) Acc@5 96.338 (97.428) Mem 22346MB +[2024-07-29 11:19:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.333 Acc@5 97.419 +[2024-07-29 11:19:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 11:19:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.33% +[2024-07-29 11:19:55 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 11:20:01 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 11:20:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][0/625] eta 0:14:14 lr 0.000013 wd 0.0500 time 1.3665 (1.3665) data time 0.4565 (0.4565) model time 0.0000 (0.0000) loss 6.1824 (6.1824) grad_norm 2.4190 (2.4190) loss_scale 256.0000 (256.0000) mem 22337MB +[2024-07-29 11:20:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][10/625] eta 0:06:06 lr 0.000013 wd 0.0500 time 0.5200 (0.5963) data time 0.0011 (0.0424) model time 0.0000 (0.0000) loss 7.2476 (6.3442) grad_norm 2.2719 (5.0611) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:20:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][20/625] eta 0:05:38 lr 0.000013 wd 0.0500 time 0.5173 (0.5592) data time 0.0010 (0.0228) model time 0.0000 (0.0000) loss 6.7843 (6.4774) grad_norm 3.0687 (4.1530) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:20:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][30/625] eta 0:05:24 lr 0.000013 wd 0.0500 time 0.5169 (0.5456) data time 0.0009 (0.0157) model time 0.0000 (0.0000) loss 5.7186 (6.5171) grad_norm 2.5855 (3.8207) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:20:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][40/625] eta 0:05:15 lr 0.000013 wd 0.0500 time 0.5140 (0.5394) data time 0.0009 (0.0122) model time 0.0000 (0.0000) loss 7.1510 (6.6159) grad_norm 2.0074 (3.4845) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:20:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][50/625] eta 0:05:07 lr 0.000013 wd 0.0500 time 0.5161 (0.5352) data time 0.0010 (0.0100) model time 0.0000 (0.0000) loss 6.9568 (6.5699) grad_norm 4.4869 (3.3433) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:20:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][60/625] eta 0:05:00 lr 0.000013 wd 0.0500 time 0.5167 (0.5326) data time 0.0010 (0.0085) model time 0.5157 (0.5181) loss 7.3892 (6.5851) grad_norm 3.0253 (3.4988) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:20:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][70/625] eta 0:04:54 lr 0.000013 wd 0.0500 time 0.5217 (0.5306) data time 0.0010 (0.0075) model time 0.5207 (0.5178) loss 6.5059 (6.5808) grad_norm 3.3590 (3.4133) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:20:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][80/625] eta 0:04:48 lr 0.000013 wd 0.0500 time 0.5185 (0.5292) data time 0.0010 (0.0067) model time 0.5175 (0.5179) loss 6.9183 (6.5903) grad_norm 5.5304 (3.4750) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:20:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][90/625] eta 0:04:42 lr 0.000013 wd 0.0500 time 0.5178 (0.5281) data time 0.0011 (0.0061) model time 0.5168 (0.5181) loss 7.4063 (6.5629) grad_norm 2.4711 (3.4355) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:20:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][100/625] eta 0:04:36 lr 0.000013 wd 0.0500 time 0.5155 (0.5271) data time 0.0011 (0.0056) model time 0.5144 (0.5179) loss 6.7329 (6.5635) grad_norm 1.9414 (3.3692) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:20:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][110/625] eta 0:04:31 lr 0.000013 wd 0.0500 time 0.5214 (0.5263) data time 0.0011 (0.0052) model time 0.5204 (0.5177) loss 6.0495 (6.5571) grad_norm 2.8857 (3.3341) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:21:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][120/625] eta 0:04:25 lr 0.000013 wd 0.0500 time 0.5164 (0.5259) data time 0.0011 (0.0049) model time 0.5153 (0.5180) loss 7.6572 (6.5634) grad_norm 2.3665 (3.2941) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:21:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][130/625] eta 0:04:20 lr 0.000013 wd 0.0500 time 0.5230 (0.5253) data time 0.0010 (0.0046) model time 0.5220 (0.5178) loss 6.0421 (6.5625) grad_norm 3.0170 (3.2835) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:21:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][140/625] eta 0:04:14 lr 0.000013 wd 0.0500 time 0.5182 (0.5248) data time 0.0008 (0.0043) model time 0.5174 (0.5178) loss 7.1303 (6.5365) grad_norm 2.3850 (3.2915) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:21:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][150/625] eta 0:04:09 lr 0.000013 wd 0.0500 time 0.5212 (0.5255) data time 0.0011 (0.0041) model time 0.5201 (0.5194) loss 6.3986 (6.5223) grad_norm 4.1808 (3.2767) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:21:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][160/625] eta 0:04:04 lr 0.000013 wd 0.0500 time 0.5179 (0.5250) data time 0.0007 (0.0039) model time 0.5171 (0.5191) loss 6.6358 (6.5216) grad_norm 2.7440 (3.2449) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:21:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][170/625] eta 0:03:58 lr 0.000013 wd 0.0500 time 0.5190 (0.5247) data time 0.0009 (0.0037) model time 0.5181 (0.5191) loss 6.2185 (6.5167) grad_norm 2.5380 (3.2300) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:21:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][180/625] eta 0:03:53 lr 0.000013 wd 0.0500 time 0.5165 (0.5243) data time 0.0009 (0.0036) model time 0.5156 (0.5189) loss 7.2272 (6.5070) grad_norm 2.0239 (3.1896) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:21:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][190/625] eta 0:03:47 lr 0.000013 wd 0.0500 time 0.5339 (0.5241) data time 0.0012 (0.0035) model time 0.5327 (0.5190) loss 6.8444 (6.4848) grad_norm 2.3097 (3.1881) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:21:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][200/625] eta 0:03:42 lr 0.000013 wd 0.0500 time 0.5324 (0.5241) data time 0.0014 (0.0034) model time 0.5310 (0.5192) loss 5.8897 (6.4757) grad_norm 5.3561 (3.1777) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:21:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][210/625] eta 0:03:37 lr 0.000013 wd 0.0500 time 0.5165 (0.5238) data time 0.0011 (0.0032) model time 0.5153 (0.5191) loss 5.9261 (6.4603) grad_norm 2.1206 (3.1653) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:21:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][220/625] eta 0:03:32 lr 0.000012 wd 0.0500 time 0.5174 (0.5242) data time 0.0012 (0.0031) model time 0.5163 (0.5199) loss 5.8910 (6.4738) grad_norm 3.8829 (3.1478) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:22:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][230/625] eta 0:03:27 lr 0.000012 wd 0.0500 time 0.5230 (0.5241) data time 0.0012 (0.0031) model time 0.5219 (0.5198) loss 6.1560 (6.4717) grad_norm 2.5226 (3.1987) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:22:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][240/625] eta 0:03:21 lr 0.000012 wd 0.0500 time 0.5191 (0.5238) data time 0.0010 (0.0030) model time 0.5181 (0.5197) loss 6.3351 (6.4519) grad_norm 1.9580 (3.1870) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:22:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][250/625] eta 0:03:16 lr 0.000012 wd 0.0500 time 0.5174 (0.5236) data time 0.0012 (0.0029) model time 0.5161 (0.5196) loss 6.9784 (6.4559) grad_norm 3.0974 (3.2013) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:22:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][260/625] eta 0:03:11 lr 0.000012 wd 0.0500 time 0.5193 (0.5234) data time 0.0011 (0.0028) model time 0.5182 (0.5194) loss 6.7519 (6.4764) grad_norm 2.8759 (3.1855) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:22:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][270/625] eta 0:03:05 lr 0.000012 wd 0.0500 time 0.5274 (0.5233) data time 0.0010 (0.0028) model time 0.5264 (0.5195) loss 7.2148 (6.4804) grad_norm 4.1059 (3.1710) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:22:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][280/625] eta 0:03:00 lr 0.000012 wd 0.0500 time 0.5172 (0.5231) data time 0.0008 (0.0027) model time 0.5164 (0.5193) loss 6.0651 (6.4961) grad_norm 8.7243 (3.1836) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:22:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][290/625] eta 0:02:55 lr 0.000012 wd 0.0500 time 0.5183 (0.5229) data time 0.0009 (0.0026) model time 0.5173 (0.5192) loss 7.6100 (6.4938) grad_norm 2.4969 (3.1914) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:22:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][300/625] eta 0:02:49 lr 0.000012 wd 0.0500 time 0.5175 (0.5228) data time 0.0009 (0.0026) model time 0.5166 (0.5191) loss 7.5610 (6.5001) grad_norm 2.0849 (3.1784) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:22:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][310/625] eta 0:02:44 lr 0.000012 wd 0.0500 time 0.5306 (0.5227) data time 0.0012 (0.0025) model time 0.5294 (0.5191) loss 7.0182 (6.5015) grad_norm 2.7058 (3.1660) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:22:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][320/625] eta 0:02:39 lr 0.000012 wd 0.0500 time 0.5182 (0.5227) data time 0.0009 (0.0025) model time 0.5173 (0.5192) loss 7.0850 (6.4929) grad_norm 1.9636 (3.1666) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:22:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][330/625] eta 0:02:34 lr 0.000012 wd 0.0500 time 0.5278 (0.5226) data time 0.0008 (0.0025) model time 0.5271 (0.5193) loss 6.2827 (6.4792) grad_norm 2.1847 (3.1455) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:22:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][340/625] eta 0:02:28 lr 0.000012 wd 0.0500 time 0.5171 (0.5227) data time 0.0009 (0.0024) model time 0.5162 (0.5194) loss 6.6239 (6.4816) grad_norm 2.9303 (3.1393) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:23:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][350/625] eta 0:02:23 lr 0.000012 wd 0.0500 time 0.5270 (0.5228) data time 0.0015 (0.0024) model time 0.5255 (0.5195) loss 6.5453 (6.4759) grad_norm 2.8044 (3.1360) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:23:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][360/625] eta 0:02:18 lr 0.000012 wd 0.0500 time 0.5193 (0.5228) data time 0.0009 (0.0024) model time 0.5183 (0.5197) loss 5.8545 (6.4757) grad_norm 3.5695 (3.1244) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:23:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][370/625] eta 0:02:13 lr 0.000012 wd 0.0500 time 0.5156 (0.5235) data time 0.0008 (0.0024) model time 0.5148 (0.5204) loss 6.3165 (6.4778) grad_norm 2.1158 (3.1405) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:23:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][380/625] eta 0:02:08 lr 0.000012 wd 0.0500 time 0.5416 (0.5235) data time 0.0009 (0.0024) model time 0.5408 (0.5205) loss 5.8259 (6.4795) grad_norm 2.8972 (3.1565) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:23:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][390/625] eta 0:02:03 lr 0.000012 wd 0.0500 time 0.5165 (0.5235) data time 0.0011 (0.0024) model time 0.5154 (0.5205) loss 5.5424 (6.4759) grad_norm 4.0502 (3.1592) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:23:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][400/625] eta 0:01:57 lr 0.000012 wd 0.0500 time 0.5197 (0.5235) data time 0.0009 (0.0023) model time 0.5189 (0.5206) loss 5.7304 (6.4690) grad_norm 4.0688 (3.1498) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:23:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][410/625] eta 0:01:52 lr 0.000012 wd 0.0500 time 0.5148 (0.5234) data time 0.0012 (0.0023) model time 0.5135 (0.5205) loss 6.6624 (6.4639) grad_norm 2.0733 (3.1355) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:23:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][420/625] eta 0:01:47 lr 0.000012 wd 0.0500 time 0.5222 (0.5234) data time 0.0008 (0.0023) model time 0.5214 (0.5205) loss 7.1108 (6.4599) grad_norm 2.3774 (3.1352) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:23:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][430/625] eta 0:01:42 lr 0.000012 wd 0.0500 time 0.5215 (0.5233) data time 0.0011 (0.0023) model time 0.5204 (0.5205) loss 7.2505 (6.4654) grad_norm 2.4315 (3.1268) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:23:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][440/625] eta 0:01:36 lr 0.000012 wd 0.0500 time 0.5241 (0.5236) data time 0.0010 (0.0022) model time 0.5230 (0.5208) loss 7.2152 (6.4639) grad_norm 2.7625 (3.1165) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:23:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][450/625] eta 0:01:31 lr 0.000012 wd 0.0500 time 0.5178 (0.5235) data time 0.0008 (0.0022) model time 0.5170 (0.5208) loss 6.8337 (6.4741) grad_norm 2.2968 (3.1146) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:24:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][460/625] eta 0:01:26 lr 0.000012 wd 0.0500 time 0.5249 (0.5234) data time 0.0008 (0.0022) model time 0.5240 (0.5207) loss 6.0072 (6.4696) grad_norm 3.1137 (3.1163) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:24:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][470/625] eta 0:01:21 lr 0.000012 wd 0.0500 time 0.5171 (0.5233) data time 0.0011 (0.0022) model time 0.5160 (0.5206) loss 6.2583 (6.4682) grad_norm 2.3816 (3.1080) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:24:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][480/625] eta 0:01:15 lr 0.000012 wd 0.0500 time 0.5163 (0.5232) data time 0.0008 (0.0021) model time 0.5155 (0.5205) loss 5.3634 (6.4653) grad_norm 3.3726 (3.1034) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:24:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][490/625] eta 0:01:10 lr 0.000012 wd 0.0500 time 0.5246 (0.5231) data time 0.0010 (0.0021) model time 0.5236 (0.5205) loss 6.4933 (6.4621) grad_norm 5.6410 (3.1099) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:24:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][500/625] eta 0:01:05 lr 0.000012 wd 0.0500 time 0.5175 (0.5230) data time 0.0009 (0.0021) model time 0.5166 (0.5204) loss 7.4782 (6.4701) grad_norm 2.6605 (3.1186) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:24:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][510/625] eta 0:01:00 lr 0.000012 wd 0.0500 time 0.5200 (0.5229) data time 0.0011 (0.0021) model time 0.5189 (0.5203) loss 5.9553 (6.4699) grad_norm 2.3769 (3.1197) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:24:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][520/625] eta 0:00:54 lr 0.000012 wd 0.0500 time 0.5218 (0.5228) data time 0.0008 (0.0021) model time 0.5210 (0.5203) loss 6.1312 (6.4706) grad_norm 2.6439 (3.1405) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:24:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][530/625] eta 0:00:49 lr 0.000012 wd 0.0500 time 0.5178 (0.5228) data time 0.0010 (0.0021) model time 0.5167 (0.5202) loss 7.0877 (6.4760) grad_norm 4.1415 (3.1400) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:24:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][540/625] eta 0:00:44 lr 0.000012 wd 0.0500 time 0.5232 (0.5227) data time 0.0008 (0.0020) model time 0.5224 (0.5202) loss 6.1965 (6.4733) grad_norm 15.2543 (3.1595) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:24:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][550/625] eta 0:00:39 lr 0.000012 wd 0.0500 time 0.5134 (0.5226) data time 0.0012 (0.0020) model time 0.5122 (0.5201) loss 5.9328 (6.4734) grad_norm 2.0491 (3.1524) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:24:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][560/625] eta 0:00:33 lr 0.000012 wd 0.0500 time 0.5322 (0.5226) data time 0.0012 (0.0020) model time 0.5310 (0.5201) loss 6.2067 (6.4761) grad_norm 3.0533 (3.1682) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:24:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][570/625] eta 0:00:28 lr 0.000012 wd 0.0500 time 0.5174 (0.5225) data time 0.0012 (0.0020) model time 0.5163 (0.5201) loss 7.3022 (6.4761) grad_norm 2.0635 (3.1587) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:25:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][580/625] eta 0:00:23 lr 0.000012 wd 0.0500 time 0.5172 (0.5225) data time 0.0010 (0.0020) model time 0.5162 (0.5200) loss 5.7013 (6.4710) grad_norm 3.0207 (3.1546) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:25:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][590/625] eta 0:00:18 lr 0.000012 wd 0.0500 time 0.5185 (0.5227) data time 0.0010 (0.0020) model time 0.5175 (0.5203) loss 5.4124 (6.4722) grad_norm 3.7588 (3.1498) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:25:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][600/625] eta 0:00:13 lr 0.000012 wd 0.0500 time 0.5302 (0.5226) data time 0.0011 (0.0019) model time 0.5291 (0.5203) loss 6.7146 (6.4648) grad_norm 2.3467 (3.1542) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:25:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][610/625] eta 0:00:07 lr 0.000012 wd 0.0500 time 0.5198 (0.5226) data time 0.0008 (0.0019) model time 0.5190 (0.5202) loss 6.2851 (6.4660) grad_norm 2.7220 (3.1443) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:25:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [296/300][620/625] eta 0:00:02 lr 0.000012 wd 0.0500 time 0.5154 (0.5225) data time 0.0008 (0.0019) model time 0.5146 (0.5202) loss 7.3918 (6.4691) grad_norm 4.1318 (3.1430) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:25:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 296 training takes 0:05:26 +[2024-07-29 11:25:27 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 11:25:30 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 11:25:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.574 (0.574) Loss 0.4844 (0.4844) Acc@1 90.576 (90.576) Acc@5 99.023 (99.023) Mem 22338MB +[2024-07-29 11:25:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.159) Loss 0.7339 (0.5859) Acc@1 83.252 (88.272) Acc@5 97.266 (98.162) Mem 22338MB +[2024-07-29 11:25:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.139) Loss 0.8066 (0.6707) Acc@1 81.445 (85.724) Acc@5 96.289 (97.417) Mem 22338MB +[2024-07-29 11:25:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.381 Acc@5 97.405 +[2024-07-29 11:25:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.4% +[2024-07-29 11:25:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 1.082 (1.082) Loss 0.4885 (0.4885) Acc@1 90.527 (90.527) Acc@5 99.023 (99.023) Mem 22338MB +[2024-07-29 11:25:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.205) Loss 0.7334 (0.5908) Acc@1 83.105 (88.290) Acc@5 97.217 (98.171) Mem 22338MB +[2024-07-29 11:25:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.163) Loss 0.8091 (0.6750) Acc@1 81.348 (85.703) Acc@5 96.338 (97.426) Mem 22338MB +[2024-07-29 11:25:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.341 Acc@5 97.415 +[2024-07-29 11:25:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.3% +[2024-07-29 11:25:38 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.34% +[2024-07-29 11:25:38 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 11:25:40 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 11:25:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][0/625] eta 0:10:09 lr 0.000012 wd 0.0500 time 0.9751 (0.9751) data time 0.4543 (0.4543) model time 0.0000 (0.0000) loss 8.0306 (8.0306) grad_norm 2.7371 (2.7371) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:25:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][10/625] eta 0:05:45 lr 0.000012 wd 0.0500 time 0.5144 (0.5614) data time 0.0010 (0.0424) model time 0.0000 (0.0000) loss 5.1860 (6.2781) grad_norm 2.4088 (2.8891) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:25:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][20/625] eta 0:05:28 lr 0.000012 wd 0.0500 time 0.5165 (0.5422) data time 0.0011 (0.0227) model time 0.0000 (0.0000) loss 6.0478 (6.4359) grad_norm 2.5421 (2.9850) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:25:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][30/625] eta 0:05:22 lr 0.000012 wd 0.0500 time 0.7164 (0.5416) data time 0.0010 (0.0157) model time 0.0000 (0.0000) loss 4.7386 (6.3677) grad_norm 3.5339 (3.0727) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:26:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][40/625] eta 0:05:13 lr 0.000012 wd 0.0500 time 0.5249 (0.5365) data time 0.0008 (0.0121) model time 0.0000 (0.0000) loss 5.5313 (6.3572) grad_norm 2.2122 (3.2455) loss_scale 256.0000 (256.0000) mem 22338MB +[2024-07-29 11:26:04 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 11:26:04 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 11:26:06 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 11:27:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 11:27:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 11:28:42 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 11:28:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 11:28:54 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 11:28:54 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 11:28:54 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 11:28:54 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 297) +[2024-07-29 11:28:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 11:29:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][50/625] eta 0:26:57 lr 0.000012 wd 0.0500 time 0.5241 (2.8128) data time 0.0007 (0.1306) model time 0.0000 (0.0000) loss 6.9019 (7.2339) grad_norm 2.3155 (2.5875) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:29:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][60/625] eta 0:13:02 lr 0.000012 wd 0.0500 time 0.5486 (1.3851) data time 0.0009 (0.0498) model time 0.5477 (0.5272) loss 7.4722 (6.8556) grad_norm 2.0020 (2.5976) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:29:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][70/625] eta 0:09:44 lr 0.000012 wd 0.0500 time 0.5179 (1.0537) data time 0.0007 (0.0312) model time 0.5172 (0.5245) loss 6.4919 (6.7692) grad_norm 2.5775 (2.7090) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:29:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][80/625] eta 0:08:13 lr 0.000012 wd 0.0500 time 0.5211 (0.9048) data time 0.0009 (0.0228) model time 0.5202 (0.5220) loss 6.4390 (6.6787) grad_norm 2.5100 (2.6575) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:29:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][90/625] eta 0:07:19 lr 0.000012 wd 0.0500 time 0.5170 (0.8208) data time 0.0009 (0.0181) model time 0.5161 (0.5208) loss 5.9387 (6.5726) grad_norm 2.0502 (2.6480) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:29:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][100/625] eta 0:06:46 lr 0.000012 wd 0.0500 time 0.5189 (0.7749) data time 0.0007 (0.0150) model time 0.5182 (0.5292) loss 7.7941 (6.5516) grad_norm 2.2554 (2.7067) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:29:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][110/625] eta 0:06:19 lr 0.000012 wd 0.0500 time 0.5219 (0.7365) data time 0.0011 (0.0129) model time 0.5208 (0.5277) loss 6.8562 (6.5406) grad_norm 4.0777 (2.7609) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:29:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][120/625] eta 0:05:57 lr 0.000012 wd 0.0500 time 0.5199 (0.7078) data time 0.0009 (0.0113) model time 0.5190 (0.5262) loss 7.9529 (6.5590) grad_norm 2.1204 (2.8103) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:29:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][130/625] eta 0:05:39 lr 0.000012 wd 0.0500 time 0.5225 (0.6860) data time 0.0007 (0.0101) model time 0.5217 (0.5254) loss 5.6943 (6.5294) grad_norm 3.3503 (2.9211) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:30:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][140/625] eta 0:05:24 lr 0.000012 wd 0.0500 time 0.5296 (0.6688) data time 0.0007 (0.0092) model time 0.5289 (0.5248) loss 6.4034 (6.5026) grad_norm 3.8296 (2.9370) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:30:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][150/625] eta 0:05:10 lr 0.000012 wd 0.0500 time 0.5201 (0.6546) data time 0.0008 (0.0084) model time 0.5192 (0.5240) loss 7.1914 (6.5391) grad_norm 2.4998 (3.1502) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:30:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 11:30:10 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 11:30:16 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 11:35:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 11:35:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 11:35:58 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 11:40:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 11:40:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 11:42:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 11:42:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 11:42:29 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 11:44:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 11:44:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 11:46:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 11:46:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 11:47:20 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 11:49:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 11:49:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 11:49:55 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 11:50:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 11:50:08 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 11:50:08 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 11:50:08 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 11:50:08 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 297) +[2024-07-29 11:50:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 11:55:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 11:55:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 11:56:17 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 11:56:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 11:56:27 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 11:56:27 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 11:56:28 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 11:56:28 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 297) +[2024-07-29 11:56:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 11:56:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][160/625] eta 0:16:12 lr 0.000012 wd 0.0500 time 0.5698 (2.0913) data time 0.0006 (0.1223) model time 0.5691 (1.9690) loss 6.7223 (6.7161) grad_norm 3.1520 (3.7048) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:56:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][170/625] eta 0:08:38 lr 0.000012 wd 0.0500 time 0.5687 (1.1389) data time 0.0009 (0.0464) model time 0.5678 (1.0925) loss 7.4582 (6.7577) grad_norm 3.9457 (3.0384) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:56:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][180/625] eta 0:06:49 lr 0.000012 wd 0.0500 time 0.5699 (0.9195) data time 0.0006 (0.0289) model time 0.5693 (0.8906) loss 6.0396 (6.6833) grad_norm 3.3563 (3.1203) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:57:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][190/625] eta 0:05:57 lr 0.000012 wd 0.0500 time 0.5853 (0.8225) data time 0.0008 (0.0211) model time 0.5845 (0.8014) loss 5.9511 (6.6227) grad_norm 3.7397 (3.0337) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:57:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][200/625] eta 0:05:26 lr 0.000012 wd 0.0500 time 0.5702 (0.7678) data time 0.0009 (0.0167) model time 0.5694 (0.7511) loss 6.4085 (6.6479) grad_norm 2.4775 (3.0209) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:57:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][210/625] eta 0:05:07 lr 0.000012 wd 0.0500 time 0.5700 (0.7401) data time 0.0007 (0.0139) model time 0.5694 (0.7262) loss 6.7113 (6.5977) grad_norm 5.6164 (2.9986) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:57:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][220/625] eta 0:04:49 lr 0.000012 wd 0.0500 time 0.5717 (0.7147) data time 0.0007 (0.0119) model time 0.5710 (0.7028) loss 6.1739 (6.5461) grad_norm 2.6709 (2.9907) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:57:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][230/625] eta 0:04:34 lr 0.000012 wd 0.0500 time 0.5716 (0.6962) data time 0.0009 (0.0105) model time 0.5707 (0.6857) loss 6.8748 (6.5029) grad_norm 2.6600 (3.0099) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:57:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][240/625] eta 0:04:22 lr 0.000012 wd 0.0500 time 0.5732 (0.6819) data time 0.0006 (0.0093) model time 0.5726 (0.6726) loss 5.7407 (6.4758) grad_norm 2.7036 (2.9854) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:57:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][250/625] eta 0:04:11 lr 0.000012 wd 0.0500 time 0.5717 (0.6705) data time 0.0006 (0.0085) model time 0.5711 (0.6620) loss 5.7188 (6.4709) grad_norm 2.1578 (2.9643) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:57:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][260/625] eta 0:04:01 lr 0.000012 wd 0.0500 time 0.5711 (0.6611) data time 0.0008 (0.0077) model time 0.5703 (0.6533) loss 6.7446 (6.5031) grad_norm 3.7337 (2.9896) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:57:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][270/625] eta 0:03:51 lr 0.000012 wd 0.0500 time 0.5718 (0.6535) data time 0.0006 (0.0071) model time 0.5712 (0.6463) loss 6.4952 (6.4858) grad_norm 3.0152 (3.0031) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:57:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][280/625] eta 0:03:43 lr 0.000012 wd 0.0500 time 0.5735 (0.6472) data time 0.0006 (0.0066) model time 0.5729 (0.6406) loss 4.9006 (6.4645) grad_norm 8.2566 (3.0467) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:57:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][290/625] eta 0:03:35 lr 0.000012 wd 0.0500 time 0.5765 (0.6418) data time 0.0008 (0.0062) model time 0.5757 (0.6356) loss 4.8017 (6.4625) grad_norm 3.1405 (3.2235) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:58:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][300/625] eta 0:03:27 lr 0.000012 wd 0.0500 time 0.5746 (0.6373) data time 0.0007 (0.0058) model time 0.5739 (0.6315) loss 5.7430 (6.4494) grad_norm 2.9329 (3.1897) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:58:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][310/625] eta 0:03:19 lr 0.000012 wd 0.0500 time 0.5780 (0.6334) data time 0.0008 (0.0055) model time 0.5772 (0.6279) loss 6.5173 (6.4516) grad_norm 2.5221 (3.1519) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:58:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][320/625] eta 0:03:12 lr 0.000012 wd 0.0500 time 0.5793 (0.6300) data time 0.0008 (0.0052) model time 0.5785 (0.6247) loss 7.2198 (6.4500) grad_norm 2.5856 (3.1289) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:58:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][330/625] eta 0:03:04 lr 0.000012 wd 0.0500 time 0.5763 (0.6268) data time 0.0006 (0.0050) model time 0.5757 (0.6218) loss 6.2041 (6.4498) grad_norm 2.1557 (3.3329) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:58:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][340/625] eta 0:02:57 lr 0.000012 wd 0.0500 time 0.5771 (0.6238) data time 0.0008 (0.0048) model time 0.5763 (0.6191) loss 6.1755 (6.4521) grad_norm 2.4740 (3.3018) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:58:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][350/625] eta 0:02:50 lr 0.000012 wd 0.0500 time 0.5728 (0.6212) data time 0.0006 (0.0046) model time 0.5722 (0.6167) loss 6.1602 (6.4269) grad_norm 2.9064 (3.3422) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:58:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][360/625] eta 0:02:44 lr 0.000012 wd 0.0500 time 0.5823 (0.6189) data time 0.0006 (0.0044) model time 0.5817 (0.6145) loss 5.7410 (6.4155) grad_norm 3.1419 (3.3658) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:58:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][370/625] eta 0:02:37 lr 0.000012 wd 0.0500 time 0.5833 (0.6170) data time 0.0006 (0.0042) model time 0.5827 (0.6128) loss 5.8163 (6.4030) grad_norm 2.4274 (3.3419) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:58:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][380/625] eta 0:02:30 lr 0.000012 wd 0.0500 time 0.5780 (0.6152) data time 0.0006 (0.0041) model time 0.5774 (0.6111) loss 6.3628 (6.4085) grad_norm 2.7204 (3.3308) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:58:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][390/625] eta 0:02:24 lr 0.000012 wd 0.0500 time 0.5773 (0.6136) data time 0.0006 (0.0039) model time 0.5767 (0.6097) loss 7.2778 (6.4172) grad_norm 3.2223 (3.3044) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:59:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][400/625] eta 0:02:17 lr 0.000012 wd 0.0500 time 0.5757 (0.6120) data time 0.0006 (0.0038) model time 0.5751 (0.6082) loss 5.5317 (6.4142) grad_norm 2.4317 (3.2865) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:59:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][410/625] eta 0:02:11 lr 0.000012 wd 0.0500 time 0.5717 (0.6104) data time 0.0009 (0.0037) model time 0.5708 (0.6068) loss 6.0006 (6.4057) grad_norm 2.8156 (3.2718) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:59:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][420/625] eta 0:02:04 lr 0.000012 wd 0.0500 time 0.5800 (0.6091) data time 0.0006 (0.0036) model time 0.5794 (0.6055) loss 6.6847 (6.4024) grad_norm 2.4596 (3.2719) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:59:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][430/625] eta 0:01:58 lr 0.000012 wd 0.0500 time 0.5731 (0.6086) data time 0.0009 (0.0035) model time 0.5722 (0.6051) loss 7.4610 (6.4055) grad_norm 2.5786 (3.2607) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:59:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][440/625] eta 0:01:52 lr 0.000012 wd 0.0500 time 0.5880 (0.6075) data time 0.0009 (0.0034) model time 0.5871 (0.6041) loss 6.2168 (6.4039) grad_norm 4.6704 (3.2538) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:59:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][450/625] eta 0:01:46 lr 0.000012 wd 0.0500 time 0.5771 (0.6064) data time 0.0007 (0.0033) model time 0.5765 (0.6030) loss 5.4411 (6.3946) grad_norm 2.5381 (3.2375) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:59:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][460/625] eta 0:01:39 lr 0.000012 wd 0.0500 time 0.5748 (0.6054) data time 0.0006 (0.0032) model time 0.5743 (0.6021) loss 6.4987 (6.3919) grad_norm 2.2120 (3.2240) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:59:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][470/625] eta 0:01:33 lr 0.000012 wd 0.0500 time 0.5707 (0.6044) data time 0.0006 (0.0032) model time 0.5701 (0.6012) loss 6.5971 (6.3974) grad_norm 3.4977 (3.2146) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:59:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][480/625] eta 0:01:27 lr 0.000012 wd 0.0500 time 0.5735 (0.6034) data time 0.0009 (0.0031) model time 0.5727 (0.6003) loss 7.4123 (6.4089) grad_norm 2.4602 (3.2072) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 11:59:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][490/625] eta 0:01:21 lr 0.000012 wd 0.0500 time 0.5732 (0.6025) data time 0.0009 (0.0030) model time 0.5723 (0.5995) loss 6.9094 (6.4061) grad_norm 3.2238 (3.2067) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 12:00:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][500/625] eta 0:01:15 lr 0.000012 wd 0.0500 time 0.5704 (0.6017) data time 0.0008 (0.0030) model time 0.5695 (0.5988) loss 7.4805 (6.4097) grad_norm 2.3990 (3.1929) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 12:00:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][510/625] eta 0:01:09 lr 0.000012 wd 0.0500 time 0.5768 (0.6010) data time 0.0006 (0.0029) model time 0.5762 (0.5981) loss 5.4716 (6.4052) grad_norm 2.9303 (3.1860) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 12:00:12 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][520/625] eta 0:01:03 lr 0.000012 wd 0.0500 time 0.5731 (0.6003) data time 0.0008 (0.0028) model time 0.5723 (0.5974) loss 7.1417 (6.4065) grad_norm 2.4347 (3.1928) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 12:00:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][530/625] eta 0:00:56 lr 0.000012 wd 0.0500 time 0.5797 (0.5996) data time 0.0006 (0.0028) model time 0.5791 (0.5969) loss 5.0183 (6.4019) grad_norm 2.0728 (3.1995) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 12:00:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][540/625] eta 0:00:50 lr 0.000012 wd 0.0500 time 0.5788 (0.5991) data time 0.0008 (0.0027) model time 0.5780 (0.5963) loss 7.0556 (6.3969) grad_norm 2.1339 (3.1888) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 12:00:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][550/625] eta 0:00:44 lr 0.000012 wd 0.0500 time 0.5959 (0.5985) data time 0.0006 (0.0027) model time 0.5953 (0.5958) loss 6.5598 (6.3955) grad_norm 2.5366 (3.1801) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 12:00:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][560/625] eta 0:00:38 lr 0.000012 wd 0.0500 time 0.5763 (0.5979) data time 0.0007 (0.0027) model time 0.5757 (0.5952) loss 6.7506 (6.3985) grad_norm 3.7582 (3.2012) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 12:00:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][570/625] eta 0:00:32 lr 0.000012 wd 0.0500 time 0.5753 (0.5973) data time 0.0006 (0.0027) model time 0.5747 (0.5946) loss 5.6106 (6.3963) grad_norm 3.7687 (3.2036) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 12:00:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][580/625] eta 0:00:26 lr 0.000012 wd 0.0500 time 0.5773 (0.5967) data time 0.0008 (0.0026) model time 0.5765 (0.5941) loss 6.5518 (6.3992) grad_norm 2.8896 (3.2078) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 12:00:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][590/625] eta 0:00:20 lr 0.000012 wd 0.0500 time 0.5742 (0.5962) data time 0.0008 (0.0026) model time 0.5734 (0.5936) loss 6.0599 (6.4048) grad_norm 4.2200 (3.2086) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 12:00:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][600/625] eta 0:00:14 lr 0.000012 wd 0.0500 time 0.5772 (0.5957) data time 0.0007 (0.0025) model time 0.5766 (0.5932) loss 6.3010 (6.4002) grad_norm 6.6368 (3.2395) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 12:01:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][610/625] eta 0:00:08 lr 0.000012 wd 0.0500 time 0.5749 (0.5952) data time 0.0004 (0.0025) model time 0.5745 (0.5927) loss 6.3654 (6.4006) grad_norm 4.2792 (3.2370) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 12:01:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [297/300][620/625] eta 0:00:02 lr 0.000012 wd 0.0500 time 0.5738 (0.5948) data time 0.0004 (0.0025) model time 0.5734 (0.5923) loss 5.9668 (6.3896) grad_norm 2.5237 (3.2369) loss_scale 256.0000 (256.0000) mem 22341MB +[2024-07-29 12:01:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 297 training takes 0:04:39 +[2024-07-29 12:01:11 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 12:01:15 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 12:01:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.476 (0.476) Loss 0.4888 (0.4888) Acc@1 90.381 (90.381) Acc@5 98.926 (98.926) Mem 22341MB +[2024-07-29 12:01:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.125 (0.157) Loss 0.7368 (0.5871) Acc@1 82.715 (88.272) Acc@5 97.217 (98.158) Mem 22341MB +[2024-07-29 12:01:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.142) Loss 0.8042 (0.6720) Acc@1 81.396 (85.779) Acc@5 96.387 (97.433) Mem 22341MB +[2024-07-29 12:01:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.401 Acc@5 97.417 +[2024-07-29 12:01:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.4% +[2024-07-29 12:01:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.993 (0.993) Loss 0.4883 (0.4883) Acc@1 90.527 (90.527) Acc@5 99.023 (99.023) Mem 22341MB +[2024-07-29 12:01:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.124 (0.204) Loss 0.7344 (0.5907) Acc@1 83.057 (88.303) Acc@5 97.168 (98.158) Mem 22341MB +[2024-07-29 12:01:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.125 (0.166) Loss 0.8086 (0.6749) Acc@1 81.348 (85.714) Acc@5 96.338 (97.421) Mem 22341MB +[2024-07-29 12:01:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.353 Acc@5 97.411 +[2024-07-29 12:01:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.4% +[2024-07-29 12:01:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.35% +[2024-07-29 12:01:24 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 12:01:26 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 12:01:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][0/625] eta 0:10:10 lr 0.000012 wd 0.0500 time 0.9770 (0.9770) data time 0.3279 (0.3279) model time 0.0000 (0.0000) loss 6.2031 (6.2031) grad_norm 9.6775 (9.6775) loss_scale 256.0000 (256.0000) mem 22337MB +[2024-07-29 12:01:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][10/625] eta 0:06:14 lr 0.000012 wd 0.0500 time 0.5703 (0.6087) data time 0.0006 (0.0306) model time 0.0000 (0.0000) loss 6.4809 (6.1319) grad_norm 2.7652 (3.3794) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:01:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][20/625] eta 0:06:04 lr 0.000012 wd 0.0500 time 0.7721 (0.6019) data time 0.0008 (0.0164) model time 0.0000 (0.0000) loss 6.3971 (6.3392) grad_norm 2.1822 (4.3385) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:01:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][30/625] eta 0:05:52 lr 0.000012 wd 0.0500 time 0.5685 (0.5925) data time 0.0009 (0.0114) model time 0.0000 (0.0000) loss 6.4162 (6.3349) grad_norm 2.2019 (3.7681) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:01:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][40/625] eta 0:05:44 lr 0.000012 wd 0.0500 time 0.5688 (0.5880) data time 0.0007 (0.0088) model time 0.0000 (0.0000) loss 7.3238 (6.4453) grad_norm 2.7890 (3.4847) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:01:56 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][50/625] eta 0:05:36 lr 0.000012 wd 0.0500 time 0.5718 (0.5852) data time 0.0007 (0.0073) model time 0.0000 (0.0000) loss 6.0142 (6.4394) grad_norm 1.8986 (3.3470) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:02:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][60/625] eta 0:05:29 lr 0.000012 wd 0.0500 time 0.5745 (0.5836) data time 0.0009 (0.0063) model time 0.5737 (0.5743) loss 6.3430 (6.3947) grad_norm 2.5628 (3.2029) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:02:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][70/625] eta 0:05:23 lr 0.000012 wd 0.0500 time 0.5682 (0.5822) data time 0.0008 (0.0055) model time 0.5673 (0.5737) loss 6.9522 (6.3788) grad_norm 2.7832 (3.5260) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:02:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][80/625] eta 0:05:16 lr 0.000012 wd 0.0500 time 0.5710 (0.5811) data time 0.0006 (0.0049) model time 0.5704 (0.5733) loss 6.9155 (6.3851) grad_norm 3.8766 (3.4596) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:02:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][90/625] eta 0:05:10 lr 0.000012 wd 0.0500 time 0.5667 (0.5803) data time 0.0008 (0.0045) model time 0.5659 (0.5731) loss 6.1395 (6.4222) grad_norm 3.6906 (3.3981) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:02:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][100/625] eta 0:05:04 lr 0.000012 wd 0.0500 time 0.5721 (0.5795) data time 0.0008 (0.0041) model time 0.5713 (0.5727) loss 7.4159 (6.4568) grad_norm 2.3085 (3.3199) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:02:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][110/625] eta 0:04:58 lr 0.000012 wd 0.0500 time 0.5696 (0.5788) data time 0.0006 (0.0038) model time 0.5689 (0.5725) loss 8.1966 (6.4859) grad_norm 4.3053 (3.2902) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:02:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][120/625] eta 0:04:52 lr 0.000012 wd 0.0500 time 0.5692 (0.5785) data time 0.0008 (0.0036) model time 0.5684 (0.5727) loss 6.9949 (6.4927) grad_norm 2.3606 (3.2972) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:02:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][130/625] eta 0:04:46 lr 0.000012 wd 0.0500 time 0.5761 (0.5782) data time 0.0008 (0.0034) model time 0.5753 (0.5728) loss 5.7606 (6.4818) grad_norm 1.9833 (3.2460) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:02:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][140/625] eta 0:04:40 lr 0.000012 wd 0.0500 time 0.5721 (0.5779) data time 0.0009 (0.0032) model time 0.5712 (0.5728) loss 6.6680 (6.4736) grad_norm 2.5833 (3.2315) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:02:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][150/625] eta 0:04:34 lr 0.000012 wd 0.0500 time 0.5722 (0.5776) data time 0.0006 (0.0030) model time 0.5716 (0.5729) loss 6.9750 (6.4801) grad_norm 2.2571 (3.2124) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:02:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][160/625] eta 0:04:28 lr 0.000012 wd 0.0500 time 0.5698 (0.5775) data time 0.0009 (0.0029) model time 0.5689 (0.5730) loss 6.8114 (6.4850) grad_norm 2.2392 (3.1886) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:03:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][170/625] eta 0:04:22 lr 0.000012 wd 0.0500 time 0.5710 (0.5772) data time 0.0007 (0.0028) model time 0.5703 (0.5729) loss 5.3629 (6.4757) grad_norm 2.4631 (3.1588) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:03:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][180/625] eta 0:04:16 lr 0.000012 wd 0.0500 time 0.5659 (0.5771) data time 0.0007 (0.0027) model time 0.5652 (0.5730) loss 6.2893 (6.4601) grad_norm 1.9742 (3.1244) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:03:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][190/625] eta 0:04:11 lr 0.000012 wd 0.0500 time 0.5754 (0.5772) data time 0.0007 (0.0026) model time 0.5748 (0.5734) loss 5.9942 (6.4336) grad_norm 2.7410 (3.2374) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:03:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][200/625] eta 0:04:05 lr 0.000012 wd 0.0500 time 0.5747 (0.5772) data time 0.0006 (0.0025) model time 0.5740 (0.5735) loss 7.2266 (6.4505) grad_norm 4.3636 (3.2180) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:03:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 379): INFO Suspend command received, saving checkpoint and exiting +[2024-07-29 12:03:26 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 12:03:27 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 12:05:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 529): INFO Full config saved to ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/config.json +[2024-07-29 12:05:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 129): INFO Creating model:vmamba2/vssd_mesa_retrain_base_e300 +[2024-07-29 12:06:13 vssd_mesa_retrain_base_e300] (optimizer.py 18): INFO ==============> building optimizer adamw.................... +[2024-07-29 12:06:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 193): INFO auto resuming from ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth +[2024-07-29 12:06:26 vssd_mesa_retrain_base_e300] (utils.py 21): INFO ==============> Resuming form ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth.................... +[2024-07-29 12:06:27 vssd_mesa_retrain_base_e300] (utils.py 30): INFO resuming model: +[2024-07-29 12:06:27 vssd_mesa_retrain_base_e300] (utils.py 37): INFO resuming model_ema: +[2024-07-29 12:06:27 vssd_mesa_retrain_base_e300] (utils.py 61): INFO => loaded successfully './exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth' (epoch 298) +[2024-07-29 12:06:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 233): INFO Start training +[2024-07-29 12:06:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][210/625] eta 0:26:11 lr 0.000012 wd 0.0500 time 0.5183 (3.7872) data time 0.0008 (0.3592) model time 0.5175 (3.4280) loss 7.0397 (6.7967) grad_norm 1.8709 (2.2029) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 12:06:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][220/625] eta 0:09:48 lr 0.000012 wd 0.0500 time 0.5173 (1.4528) data time 0.0009 (0.1034) model time 0.5164 (1.3494) loss 6.4583 (6.7377) grad_norm 2.5230 (2.7491) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 12:06:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][230/625] eta 0:06:59 lr 0.000012 wd 0.0500 time 0.5171 (1.0632) data time 0.0011 (0.0607) model time 0.5161 (1.0025) loss 6.2863 (6.6890) grad_norm 2.7798 (2.6909) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 12:07:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][240/625] eta 0:05:47 lr 0.000012 wd 0.0500 time 0.5178 (0.9032) data time 0.0008 (0.0432) model time 0.5170 (0.8600) loss 5.9784 (6.6976) grad_norm 2.2409 (2.8012) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 12:07:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][250/625] eta 0:05:05 lr 0.000012 wd 0.0500 time 0.5195 (0.8156) data time 0.0008 (0.0336) model time 0.5187 (0.7820) loss 6.4595 (6.6658) grad_norm 2.1986 (2.8977) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 12:07:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][260/625] eta 0:04:39 lr 0.000012 wd 0.0500 time 0.5328 (0.7652) data time 0.0008 (0.0276) model time 0.5320 (0.7376) loss 6.6339 (6.6257) grad_norm 2.6404 (2.8422) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 12:07:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][270/625] eta 0:04:19 lr 0.000012 wd 0.0500 time 0.5158 (0.7301) data time 0.0008 (0.0234) model time 0.5150 (0.7067) loss 6.2785 (6.5674) grad_norm 2.0262 (2.7966) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 12:07:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][280/625] eta 0:04:01 lr 0.000012 wd 0.0500 time 0.5177 (0.7014) data time 0.0008 (0.0204) model time 0.5169 (0.6810) loss 7.2386 (6.5711) grad_norm 2.6475 (2.8408) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 12:07:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][290/625] eta 0:03:47 lr 0.000012 wd 0.0500 time 0.5160 (0.6795) data time 0.0012 (0.0181) model time 0.5147 (0.6614) loss 7.2608 (6.5602) grad_norm 1.7840 (2.9105) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 12:07:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][300/625] eta 0:03:35 lr 0.000012 wd 0.0500 time 0.5180 (0.6623) data time 0.0011 (0.0163) model time 0.5169 (0.6460) loss 5.6789 (6.5148) grad_norm 18.2120 (3.2236) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 12:07:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][310/625] eta 0:03:24 lr 0.000012 wd 0.0500 time 0.5163 (0.6485) data time 0.0011 (0.0148) model time 0.5152 (0.6337) loss 6.4414 (6.5619) grad_norm 2.0124 (3.2194) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 12:07:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][320/625] eta 0:03:14 lr 0.000012 wd 0.0500 time 0.5166 (0.6370) data time 0.0011 (0.0136) model time 0.5155 (0.6234) loss 7.0940 (6.5653) grad_norm 2.0938 (3.1833) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 12:07:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][330/625] eta 0:03:05 lr 0.000012 wd 0.0500 time 0.5166 (0.6274) data time 0.0010 (0.0126) model time 0.5156 (0.6148) loss 5.7425 (6.5489) grad_norm 2.4960 (3.1501) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 12:07:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][340/625] eta 0:02:56 lr 0.000012 wd 0.0500 time 0.5168 (0.6192) data time 0.0010 (0.0117) model time 0.5157 (0.6075) loss 7.8681 (6.5685) grad_norm 2.2221 (3.1340) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 12:08:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][350/625] eta 0:02:48 lr 0.000012 wd 0.0500 time 0.5169 (0.6123) data time 0.0008 (0.0110) model time 0.5161 (0.6013) loss 6.1717 (6.5443) grad_norm 2.1068 (3.1402) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 12:08:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][360/625] eta 0:02:40 lr 0.000012 wd 0.0500 time 0.5167 (0.6061) data time 0.0008 (0.0103) model time 0.5159 (0.5957) loss 6.2009 (6.5381) grad_norm 2.0214 (3.1077) loss_scale 256.0000 (256.0000) mem 22344MB +[2024-07-29 12:08:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][370/625] eta 0:02:33 lr 0.000012 wd 0.0500 time 0.5154 (0.6007) data time 0.0007 (0.0098) model time 0.5147 (0.5909) loss 6.1072 (6.5322) grad_norm 2.1971 (3.0821) loss_scale 512.0000 (257.5610) mem 22344MB +[2024-07-29 12:08:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][380/625] eta 0:02:26 lr 0.000012 wd 0.0500 time 0.5217 (0.5960) data time 0.0010 (0.0093) model time 0.5207 (0.5867) loss 5.3151 (6.5457) grad_norm 2.0712 (3.0662) loss_scale 512.0000 (272.1839) mem 22344MB +[2024-07-29 12:08:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][390/625] eta 0:02:19 lr 0.000012 wd 0.0500 time 0.5163 (0.5918) data time 0.0009 (0.0089) model time 0.5155 (0.5829) loss 5.9319 (6.5272) grad_norm 4.0958 (3.0631) loss_scale 512.0000 (285.2174) mem 22344MB +[2024-07-29 12:08:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][400/625] eta 0:02:12 lr 0.000012 wd 0.0500 time 0.5172 (0.5880) data time 0.0008 (0.0085) model time 0.5165 (0.5795) loss 5.8374 (6.5120) grad_norm 3.5870 (3.0724) loss_scale 512.0000 (296.9072) mem 22344MB +[2024-07-29 12:08:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][410/625] eta 0:02:05 lr 0.000012 wd 0.0500 time 0.5176 (0.5846) data time 0.0009 (0.0081) model time 0.5167 (0.5765) loss 6.7728 (6.4887) grad_norm 2.5356 (3.0671) loss_scale 512.0000 (307.4510) mem 22344MB +[2024-07-29 12:08:36 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][420/625] eta 0:01:59 lr 0.000012 wd 0.0500 time 0.5193 (0.5815) data time 0.0008 (0.0078) model time 0.5186 (0.5738) loss 6.5873 (6.4804) grad_norm 3.9816 (3.0821) loss_scale 512.0000 (317.0093) mem 22344MB +[2024-07-29 12:08:41 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][430/625] eta 0:01:52 lr 0.000012 wd 0.0500 time 0.5187 (0.5787) data time 0.0010 (0.0075) model time 0.5177 (0.5712) loss 7.6776 (6.4833) grad_norm 2.6296 (3.0740) loss_scale 512.0000 (325.7143) mem 22344MB +[2024-07-29 12:08:46 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][440/625] eta 0:01:46 lr 0.000012 wd 0.0500 time 0.5168 (0.5761) data time 0.0010 (0.0072) model time 0.5158 (0.5689) loss 5.6251 (6.4731) grad_norm 3.1535 (3.0576) loss_scale 512.0000 (333.6752) mem 22344MB +[2024-07-29 12:08:51 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][450/625] eta 0:01:40 lr 0.000012 wd 0.0500 time 0.5168 (0.5737) data time 0.0010 (0.0069) model time 0.5159 (0.5668) loss 5.6831 (6.4744) grad_norm 2.3397 (3.0415) loss_scale 512.0000 (340.9836) mem 22344MB +[2024-07-29 12:08:57 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][460/625] eta 0:01:34 lr 0.000012 wd 0.0500 time 0.5252 (0.5716) data time 0.0007 (0.0067) model time 0.5244 (0.5649) loss 5.8565 (6.4685) grad_norm 2.9534 (3.0282) loss_scale 512.0000 (347.7165) mem 22344MB +[2024-07-29 12:09:02 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][470/625] eta 0:01:28 lr 0.000012 wd 0.0500 time 0.5167 (0.5696) data time 0.0007 (0.0065) model time 0.5160 (0.5631) loss 5.6900 (6.4583) grad_norm 2.5377 (3.0128) loss_scale 512.0000 (353.9394) mem 22344MB +[2024-07-29 12:09:07 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][480/625] eta 0:01:22 lr 0.000012 wd 0.0500 time 0.7187 (0.5691) data time 0.0011 (0.0063) model time 0.7176 (0.5628) loss 7.1155 (6.4638) grad_norm 2.6606 (3.0086) loss_scale 512.0000 (359.7080) mem 22344MB +[2024-07-29 12:09:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][490/625] eta 0:01:16 lr 0.000012 wd 0.0500 time 0.5171 (0.5673) data time 0.0008 (0.0061) model time 0.5163 (0.5612) loss 5.1512 (6.4637) grad_norm 2.4963 (3.0236) loss_scale 512.0000 (365.0704) mem 22344MB +[2024-07-29 12:09:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][500/625] eta 0:01:10 lr 0.000012 wd 0.0500 time 0.5169 (0.5662) data time 0.0008 (0.0059) model time 0.5162 (0.5603) loss 6.3535 (6.4633) grad_norm 2.3261 (3.0144) loss_scale 512.0000 (370.0680) mem 22344MB +[2024-07-29 12:09:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][510/625] eta 0:01:04 lr 0.000012 wd 0.0500 time 0.5167 (0.5646) data time 0.0011 (0.0058) model time 0.5155 (0.5589) loss 7.0275 (6.4570) grad_norm 2.5201 (3.0222) loss_scale 512.0000 (374.7368) mem 22344MB +[2024-07-29 12:09:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][520/625] eta 0:00:59 lr 0.000012 wd 0.0500 time 0.5164 (0.5632) data time 0.0011 (0.0056) model time 0.5153 (0.5575) loss 7.0369 (6.4711) grad_norm 4.1947 (3.0227) loss_scale 512.0000 (379.1083) mem 22344MB +[2024-07-29 12:09:33 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][530/625] eta 0:00:53 lr 0.000012 wd 0.0500 time 0.5183 (0.5619) data time 0.0007 (0.0055) model time 0.5176 (0.5564) loss 7.2033 (6.4851) grad_norm 2.8115 (3.0121) loss_scale 512.0000 (383.2099) mem 22344MB +[2024-07-29 12:09:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][540/625] eta 0:00:47 lr 0.000012 wd 0.0500 time 0.5169 (0.5606) data time 0.0010 (0.0053) model time 0.5159 (0.5553) loss 6.5297 (6.4878) grad_norm 2.2093 (3.0038) loss_scale 512.0000 (387.0659) mem 22344MB +[2024-07-29 12:09:44 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][550/625] eta 0:00:41 lr 0.000012 wd 0.0500 time 0.5222 (0.5594) data time 0.0010 (0.0052) model time 0.5212 (0.5542) loss 7.6509 (6.4990) grad_norm 2.8696 (3.0001) loss_scale 512.0000 (390.6977) mem 22344MB +[2024-07-29 12:09:49 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][560/625] eta 0:00:36 lr 0.000012 wd 0.0500 time 0.5174 (0.5582) data time 0.0012 (0.0051) model time 0.5162 (0.5531) loss 6.6475 (6.4958) grad_norm 2.3564 (inf) loss_scale 256.0000 (390.5085) mem 22344MB +[2024-07-29 12:09:54 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][570/625] eta 0:00:30 lr 0.000012 wd 0.0500 time 0.5181 (0.5571) data time 0.0008 (0.0050) model time 0.5174 (0.5521) loss 5.2349 (6.4941) grad_norm 3.0704 (inf) loss_scale 256.0000 (386.8132) mem 22344MB +[2024-07-29 12:09:59 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][580/625] eta 0:00:25 lr 0.000012 wd 0.0500 time 0.5220 (0.5561) data time 0.0010 (0.0049) model time 0.5210 (0.5512) loss 7.2436 (6.4992) grad_norm 2.7763 (inf) loss_scale 256.0000 (383.3155) mem 22344MB +[2024-07-29 12:10:05 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][590/625] eta 0:00:19 lr 0.000012 wd 0.0500 time 0.5193 (0.5552) data time 0.0010 (0.0048) model time 0.5184 (0.5504) loss 5.9902 (6.4869) grad_norm 2.9283 (inf) loss_scale 256.0000 (380.0000) mem 22344MB +[2024-07-29 12:10:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][600/625] eta 0:00:13 lr 0.000012 wd 0.0500 time 0.5174 (0.5543) data time 0.0010 (0.0047) model time 0.5164 (0.5496) loss 6.6928 (6.4786) grad_norm 3.3927 (inf) loss_scale 256.0000 (376.8528) mem 22344MB +[2024-07-29 12:10:15 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][610/625] eta 0:00:08 lr 0.000012 wd 0.0500 time 0.5220 (0.5534) data time 0.0005 (0.0046) model time 0.5215 (0.5488) loss 6.7883 (6.4772) grad_norm 2.1871 (inf) loss_scale 256.0000 (373.8614) mem 22344MB +[2024-07-29 12:10:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [298/300][620/625] eta 0:00:02 lr 0.000012 wd 0.0500 time 0.5143 (0.5525) data time 0.0007 (0.0045) model time 0.5135 (0.5480) loss 6.3142 (6.4781) grad_norm 2.8354 (inf) loss_scale 256.0000 (371.0145) mem 22344MB +[2024-07-29 12:10:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 298 training takes 0:03:50 +[2024-07-29 12:10:22 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 12:10:28 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 12:10:28 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.582 (0.582) Loss 0.4849 (0.4849) Acc@1 90.576 (90.576) Acc@5 99.023 (99.023) Mem 22344MB +[2024-07-29 12:10:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.160) Loss 0.7407 (0.5878) Acc@1 83.154 (88.308) Acc@5 97.070 (98.176) Mem 22344MB +[2024-07-29 12:10:31 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.139) Loss 0.8047 (0.6730) Acc@1 81.494 (85.777) Acc@5 96.582 (97.445) Mem 22344MB +[2024-07-29 12:10:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.435 Acc@5 97.429 +[2024-07-29 12:10:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.4% +[2024-07-29 12:10:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 265): INFO New max accuracy: 85.43% +[2024-07-29 12:10:34 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saving...... +[2024-07-29 12:10:37 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt.pth saved !!! +[2024-07-29 12:10:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.581 (0.581) Loss 0.4880 (0.4880) Acc@1 90.527 (90.527) Acc@5 99.023 (99.023) Mem 22344MB +[2024-07-29 12:10:39 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.159) Loss 0.7344 (0.5904) Acc@1 82.959 (88.308) Acc@5 97.168 (98.167) Mem 22344MB +[2024-07-29 12:10:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.139) Loss 0.8086 (0.6747) Acc@1 81.445 (85.731) Acc@5 96.289 (97.428) Mem 22344MB +[2024-07-29 12:10:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.369 Acc@5 97.417 +[2024-07-29 12:10:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.4% +[2024-07-29 12:10:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.37% +[2024-07-29 12:10:40 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 12:10:44 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 12:10:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][0/625] eta 0:12:57 lr 0.000012 wd 0.0500 time 1.2444 (1.2444) data time 0.5115 (0.5115) model time 0.0000 (0.0000) loss 4.9669 (4.9669) grad_norm 3.6614 (3.6614) loss_scale 256.0000 (256.0000) mem 22337MB +[2024-07-29 12:10:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][10/625] eta 0:05:59 lr 0.000012 wd 0.0500 time 0.5162 (0.5851) data time 0.0007 (0.0473) model time 0.0000 (0.0000) loss 6.2028 (6.4503) grad_norm 2.4207 (3.3327) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:10:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][20/625] eta 0:05:34 lr 0.000012 wd 0.0500 time 0.5166 (0.5536) data time 0.0012 (0.0253) model time 0.0000 (0.0000) loss 6.8197 (6.5223) grad_norm 2.7690 (3.4546) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:11:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][30/625] eta 0:05:22 lr 0.000012 wd 0.0500 time 0.5162 (0.5423) data time 0.0011 (0.0175) model time 0.0000 (0.0000) loss 7.1891 (6.4675) grad_norm 2.5400 (3.1452) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:11:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][40/625] eta 0:05:14 lr 0.000012 wd 0.0500 time 0.5218 (0.5368) data time 0.0010 (0.0134) model time 0.0000 (0.0000) loss 6.2804 (6.3551) grad_norm 2.3476 (3.2443) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:11:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][50/625] eta 0:05:06 lr 0.000012 wd 0.0500 time 0.5166 (0.5332) data time 0.0007 (0.0111) model time 0.0000 (0.0000) loss 5.7290 (6.3357) grad_norm 2.4839 (3.1421) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:11:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][60/625] eta 0:04:59 lr 0.000012 wd 0.0500 time 0.5177 (0.5307) data time 0.0010 (0.0094) model time 0.5167 (0.5170) loss 6.8931 (6.3467) grad_norm 2.1819 (3.4346) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:11:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][70/625] eta 0:04:53 lr 0.000012 wd 0.0500 time 0.5152 (0.5288) data time 0.0011 (0.0082) model time 0.5141 (0.5166) loss 6.7707 (6.4033) grad_norm 4.2183 (3.4592) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:11:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][80/625] eta 0:04:48 lr 0.000012 wd 0.0500 time 0.5196 (0.5301) data time 0.0008 (0.0074) model time 0.5188 (0.5239) loss 6.0202 (6.3979) grad_norm 2.3066 (3.4288) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:11:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][90/625] eta 0:04:42 lr 0.000012 wd 0.0500 time 0.5175 (0.5288) data time 0.0008 (0.0067) model time 0.5167 (0.5221) loss 7.0414 (6.4210) grad_norm 2.9935 (3.3907) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:11:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][100/625] eta 0:04:37 lr 0.000012 wd 0.0500 time 0.5218 (0.5278) data time 0.0010 (0.0061) model time 0.5208 (0.5213) loss 6.0713 (6.4246) grad_norm 2.8235 (3.3993) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:11:43 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][110/625] eta 0:04:32 lr 0.000012 wd 0.0500 time 0.5194 (0.5290) data time 0.0008 (0.0056) model time 0.5186 (0.5243) loss 6.4742 (6.4125) grad_norm 2.2487 (3.3283) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:11:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][120/625] eta 0:04:26 lr 0.000012 wd 0.0500 time 0.5171 (0.5282) data time 0.0010 (0.0053) model time 0.5161 (0.5235) loss 6.6863 (6.4071) grad_norm 4.0065 (3.2825) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:11:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][130/625] eta 0:04:21 lr 0.000012 wd 0.0500 time 0.5235 (0.5276) data time 0.0008 (0.0049) model time 0.5227 (0.5229) loss 6.7262 (6.3952) grad_norm 2.5105 (3.2269) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:11:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][140/625] eta 0:04:15 lr 0.000012 wd 0.0500 time 0.5174 (0.5270) data time 0.0009 (0.0047) model time 0.5165 (0.5225) loss 6.4405 (6.4130) grad_norm 2.8187 (3.1992) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:12:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][150/625] eta 0:04:10 lr 0.000012 wd 0.0500 time 0.5174 (0.5265) data time 0.0010 (0.0044) model time 0.5163 (0.5220) loss 5.7176 (6.4369) grad_norm 4.2046 (3.1877) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:12:09 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][160/625] eta 0:04:04 lr 0.000012 wd 0.0500 time 0.5160 (0.5260) data time 0.0008 (0.0042) model time 0.5152 (0.5217) loss 7.2679 (6.4607) grad_norm 2.5228 (3.1721) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:12:14 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][170/625] eta 0:03:59 lr 0.000012 wd 0.0500 time 0.5190 (0.5256) data time 0.0010 (0.0040) model time 0.5179 (0.5214) loss 6.0783 (6.4794) grad_norm 3.6496 (3.2151) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:12:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][180/625] eta 0:03:53 lr 0.000012 wd 0.0500 time 0.5260 (0.5252) data time 0.0010 (0.0039) model time 0.5250 (0.5211) loss 6.0517 (6.4752) grad_norm 2.2963 (3.1952) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:12:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][190/625] eta 0:03:48 lr 0.000012 wd 0.0500 time 0.5174 (0.5250) data time 0.0008 (0.0037) model time 0.5166 (0.5210) loss 7.0568 (6.4741) grad_norm 2.3142 (3.1737) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:12:30 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][200/625] eta 0:03:43 lr 0.000012 wd 0.0500 time 0.7216 (0.5256) data time 0.0008 (0.0036) model time 0.7208 (0.5221) loss 6.2719 (6.4788) grad_norm 3.0740 (3.1621) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:12:35 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][210/625] eta 0:03:37 lr 0.000012 wd 0.0500 time 0.5175 (0.5253) data time 0.0010 (0.0034) model time 0.5165 (0.5217) loss 6.0466 (6.4839) grad_norm 2.5843 (3.1335) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:12:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][220/625] eta 0:03:32 lr 0.000012 wd 0.0500 time 0.5169 (0.5250) data time 0.0008 (0.0033) model time 0.5162 (0.5215) loss 6.2485 (6.4841) grad_norm 2.2535 (3.0998) loss_scale 256.0000 (256.0000) mem 22339MB +[2024-07-29 12:12:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][230/625] eta 0:03:27 lr 0.000012 wd 0.0500 time 0.5158 (0.5246) data time 0.0008 (0.0032) model time 0.5151 (0.5212) loss 6.0394 (6.4646) grad_norm 2.3479 (inf) loss_scale 128.0000 (250.4589) mem 22339MB +[2024-07-29 12:12:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][240/625] eta 0:03:21 lr 0.000012 wd 0.0500 time 0.5193 (0.5244) data time 0.0010 (0.0031) model time 0.5183 (0.5210) loss 6.2725 (6.4438) grad_norm 2.3129 (inf) loss_scale 128.0000 (245.3776) mem 22339MB +[2024-07-29 12:12:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][250/625] eta 0:03:16 lr 0.000012 wd 0.0500 time 0.5171 (0.5241) data time 0.0008 (0.0031) model time 0.5164 (0.5208) loss 6.5560 (6.4453) grad_norm 3.5360 (inf) loss_scale 128.0000 (240.7012) mem 22339MB +[2024-07-29 12:13:01 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][260/625] eta 0:03:11 lr 0.000012 wd 0.0500 time 0.5177 (0.5239) data time 0.0008 (0.0030) model time 0.5169 (0.5206) loss 6.2914 (6.4560) grad_norm 5.2692 (inf) loss_scale 128.0000 (236.3831) mem 22339MB +[2024-07-29 12:13:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][270/625] eta 0:03:05 lr 0.000012 wd 0.0500 time 0.5162 (0.5237) data time 0.0010 (0.0029) model time 0.5153 (0.5205) loss 6.1086 (6.4484) grad_norm 2.2595 (inf) loss_scale 128.0000 (232.3838) mem 22339MB +[2024-07-29 12:13:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][280/625] eta 0:03:00 lr 0.000012 wd 0.0500 time 0.5175 (0.5235) data time 0.0011 (0.0028) model time 0.5164 (0.5204) loss 5.7300 (6.4425) grad_norm 2.3471 (inf) loss_scale 128.0000 (228.6690) mem 22339MB +[2024-07-29 12:13:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][290/625] eta 0:02:55 lr 0.000012 wd 0.0500 time 0.5198 (0.5233) data time 0.0010 (0.0028) model time 0.5188 (0.5202) loss 5.5554 (6.4346) grad_norm 2.5977 (inf) loss_scale 128.0000 (225.2096) mem 22339MB +[2024-07-29 12:13:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][300/625] eta 0:02:50 lr 0.000012 wd 0.0500 time 0.5176 (0.5238) data time 0.0008 (0.0027) model time 0.5168 (0.5209) loss 6.7137 (6.4386) grad_norm 2.0545 (inf) loss_scale 128.0000 (221.9801) mem 22339MB +[2024-07-29 12:13:27 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][310/625] eta 0:02:44 lr 0.000012 wd 0.0500 time 0.5187 (0.5237) data time 0.0008 (0.0027) model time 0.5180 (0.5208) loss 7.6313 (6.4372) grad_norm 5.0111 (inf) loss_scale 128.0000 (218.9582) mem 22339MB +[2024-07-29 12:13:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][320/625] eta 0:02:39 lr 0.000012 wd 0.0500 time 0.5156 (0.5235) data time 0.0011 (0.0026) model time 0.5145 (0.5206) loss 7.0331 (6.4500) grad_norm 3.7421 (inf) loss_scale 128.0000 (216.1246) mem 22339MB +[2024-07-29 12:13:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][330/625] eta 0:02:34 lr 0.000012 wd 0.0500 time 0.5171 (0.5234) data time 0.0010 (0.0026) model time 0.5160 (0.5206) loss 6.7604 (6.4536) grad_norm 3.9573 (inf) loss_scale 128.0000 (213.4622) mem 22339MB +[2024-07-29 12:13:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][340/625] eta 0:02:29 lr 0.000012 wd 0.0500 time 0.5166 (0.5233) data time 0.0010 (0.0025) model time 0.5156 (0.5205) loss 7.1249 (6.4505) grad_norm 2.3859 (inf) loss_scale 128.0000 (210.9560) mem 22339MB +[2024-07-29 12:13:48 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][350/625] eta 0:02:23 lr 0.000012 wd 0.0500 time 0.5358 (0.5232) data time 0.0007 (0.0025) model time 0.5351 (0.5205) loss 6.0656 (6.4537) grad_norm 10.0210 (inf) loss_scale 128.0000 (208.5926) mem 22339MB +[2024-07-29 12:13:53 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][360/625] eta 0:02:18 lr 0.000012 wd 0.0500 time 0.5242 (0.5231) data time 0.0011 (0.0024) model time 0.5231 (0.5204) loss 5.4129 (6.4550) grad_norm 6.1683 (inf) loss_scale 128.0000 (206.3601) mem 22339MB +[2024-07-29 12:13:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][370/625] eta 0:02:13 lr 0.000012 wd 0.0500 time 0.5177 (0.5230) data time 0.0011 (0.0024) model time 0.5165 (0.5203) loss 5.8852 (6.4595) grad_norm 5.9435 (inf) loss_scale 128.0000 (204.2480) mem 22339MB +[2024-07-29 12:14:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][380/625] eta 0:02:08 lr 0.000012 wd 0.0500 time 0.5191 (0.5228) data time 0.0011 (0.0024) model time 0.5180 (0.5202) loss 7.1039 (6.4613) grad_norm 2.5048 (inf) loss_scale 128.0000 (202.2467) mem 22339MB +[2024-07-29 12:14:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][390/625] eta 0:02:02 lr 0.000012 wd 0.0500 time 0.5164 (0.5228) data time 0.0008 (0.0023) model time 0.5157 (0.5202) loss 5.1839 (6.4485) grad_norm 2.4479 (inf) loss_scale 128.0000 (200.3478) mem 22339MB +[2024-07-29 12:14:13 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][400/625] eta 0:01:57 lr 0.000012 wd 0.0500 time 0.5198 (0.5227) data time 0.0010 (0.0023) model time 0.5188 (0.5201) loss 7.1106 (6.4487) grad_norm 2.7315 (inf) loss_scale 128.0000 (198.5436) mem 22339MB +[2024-07-29 12:14:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][410/625] eta 0:01:52 lr 0.000012 wd 0.0500 time 0.5188 (0.5226) data time 0.0008 (0.0023) model time 0.5180 (0.5201) loss 6.1416 (6.4393) grad_norm 3.3415 (inf) loss_scale 128.0000 (196.8273) mem 22339MB +[2024-07-29 12:14:24 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][420/625] eta 0:01:47 lr 0.000012 wd 0.0500 time 0.5175 (0.5229) data time 0.0008 (0.0022) model time 0.5167 (0.5205) loss 6.7834 (6.4360) grad_norm 2.9468 (inf) loss_scale 128.0000 (195.1924) mem 22339MB +[2024-07-29 12:14:29 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][430/625] eta 0:01:41 lr 0.000012 wd 0.0500 time 0.5177 (0.5228) data time 0.0009 (0.0022) model time 0.5167 (0.5204) loss 6.1388 (6.4275) grad_norm 2.4223 (inf) loss_scale 128.0000 (193.6334) mem 22339MB +[2024-07-29 12:14:34 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][440/625] eta 0:01:36 lr 0.000012 wd 0.0500 time 0.5173 (0.5227) data time 0.0008 (0.0022) model time 0.5165 (0.5203) loss 6.4172 (6.4326) grad_norm 2.7487 (inf) loss_scale 128.0000 (192.1451) mem 22339MB +[2024-07-29 12:14:40 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][450/625] eta 0:01:31 lr 0.000012 wd 0.0500 time 0.5229 (0.5226) data time 0.0010 (0.0021) model time 0.5218 (0.5203) loss 6.5749 (6.4322) grad_norm 4.4970 (inf) loss_scale 128.0000 (190.7228) mem 22339MB +[2024-07-29 12:14:45 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][460/625] eta 0:01:26 lr 0.000012 wd 0.0500 time 0.5177 (0.5225) data time 0.0007 (0.0021) model time 0.5170 (0.5202) loss 6.1095 (6.4296) grad_norm 2.3880 (inf) loss_scale 128.0000 (189.3623) mem 22339MB +[2024-07-29 12:14:50 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][470/625] eta 0:01:20 lr 0.000012 wd 0.0500 time 0.5214 (0.5224) data time 0.0008 (0.0021) model time 0.5206 (0.5201) loss 6.1875 (6.4252) grad_norm 3.0472 (inf) loss_scale 128.0000 (188.0594) mem 22339MB +[2024-07-29 12:14:55 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][480/625] eta 0:01:15 lr 0.000012 wd 0.0500 time 0.5177 (0.5224) data time 0.0010 (0.0021) model time 0.5167 (0.5201) loss 5.7765 (6.4183) grad_norm 2.6749 (inf) loss_scale 128.0000 (186.8108) mem 22339MB +[2024-07-29 12:15:00 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][490/625] eta 0:01:10 lr 0.000012 wd 0.0500 time 0.5173 (0.5223) data time 0.0009 (0.0020) model time 0.5164 (0.5201) loss 6.7965 (6.4212) grad_norm 3.0092 (inf) loss_scale 128.0000 (185.6130) mem 22339MB +[2024-07-29 12:15:06 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][500/625] eta 0:01:05 lr 0.000012 wd 0.0500 time 0.5282 (0.5222) data time 0.0008 (0.0020) model time 0.5274 (0.5200) loss 6.9216 (6.4220) grad_norm 1.9356 (inf) loss_scale 128.0000 (184.4631) mem 22339MB +[2024-07-29 12:15:11 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][510/625] eta 0:01:00 lr 0.000012 wd 0.0500 time 0.5159 (0.5221) data time 0.0007 (0.0020) model time 0.5151 (0.5199) loss 6.5011 (6.4257) grad_norm 2.5654 (inf) loss_scale 128.0000 (183.3581) mem 22339MB +[2024-07-29 12:15:16 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][520/625] eta 0:00:54 lr 0.000012 wd 0.0500 time 0.5234 (0.5225) data time 0.0011 (0.0020) model time 0.5223 (0.5203) loss 6.3867 (6.4268) grad_norm 1.9673 (inf) loss_scale 128.0000 (182.2956) mem 22339MB +[2024-07-29 12:15:21 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][530/625] eta 0:00:49 lr 0.000012 wd 0.0500 time 0.5171 (0.5224) data time 0.0009 (0.0020) model time 0.5162 (0.5202) loss 5.7087 (6.4228) grad_norm 3.3408 (inf) loss_scale 128.0000 (181.2731) mem 22339MB +[2024-07-29 12:15:26 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][540/625] eta 0:00:44 lr 0.000012 wd 0.0500 time 0.5171 (0.5223) data time 0.0008 (0.0020) model time 0.5164 (0.5201) loss 5.4874 (6.4196) grad_norm 3.3656 (inf) loss_scale 128.0000 (180.2884) mem 22339MB +[2024-07-29 12:15:32 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][550/625] eta 0:00:39 lr 0.000012 wd 0.0500 time 0.5186 (0.5222) data time 0.0010 (0.0019) model time 0.5176 (0.5201) loss 7.4304 (6.4242) grad_norm 2.9639 (inf) loss_scale 128.0000 (179.3394) mem 22339MB +[2024-07-29 12:15:37 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][560/625] eta 0:00:33 lr 0.000012 wd 0.0500 time 0.5183 (0.5221) data time 0.0010 (0.0019) model time 0.5173 (0.5200) loss 6.5216 (6.4254) grad_norm 2.0937 (inf) loss_scale 128.0000 (178.4242) mem 22339MB +[2024-07-29 12:15:42 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][570/625] eta 0:00:28 lr 0.000012 wd 0.0500 time 0.5184 (0.5221) data time 0.0008 (0.0019) model time 0.5176 (0.5200) loss 7.0103 (6.4293) grad_norm 2.0396 (inf) loss_scale 128.0000 (177.5412) mem 22339MB +[2024-07-29 12:15:47 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][580/625] eta 0:00:23 lr 0.000012 wd 0.0500 time 0.5167 (0.5220) data time 0.0009 (0.0019) model time 0.5157 (0.5200) loss 6.8362 (6.4295) grad_norm 6.5263 (inf) loss_scale 128.0000 (176.6885) mem 22339MB +[2024-07-29 12:15:52 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][590/625] eta 0:00:18 lr 0.000012 wd 0.0500 time 0.5153 (0.5220) data time 0.0009 (0.0019) model time 0.5144 (0.5200) loss 6.2224 (6.4244) grad_norm 2.4897 (inf) loss_scale 128.0000 (175.8646) mem 22339MB +[2024-07-29 12:15:58 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][600/625] eta 0:00:13 lr 0.000012 wd 0.0500 time 0.5175 (0.5220) data time 0.0007 (0.0019) model time 0.5168 (0.5199) loss 5.5476 (6.4232) grad_norm 1.9410 (inf) loss_scale 128.0000 (175.0682) mem 22339MB +[2024-07-29 12:16:03 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][610/625] eta 0:00:07 lr 0.000012 wd 0.0500 time 0.5131 (0.5219) data time 0.0008 (0.0018) model time 0.5124 (0.5199) loss 6.2435 (6.4171) grad_norm 2.5751 (inf) loss_scale 128.0000 (174.2979) mem 22339MB +[2024-07-29 12:16:08 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 367): INFO Train: [299/300][620/625] eta 0:00:02 lr 0.000012 wd 0.0500 time 0.5269 (0.5218) data time 0.0008 (0.0018) model time 0.5261 (0.5198) loss 7.2019 (6.4184) grad_norm 2.1356 (inf) loss_scale 128.0000 (173.5523) mem 22339MB +[2024-07-29 12:16:10 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 394): INFO EPOCH 299 training takes 0:05:26 +[2024-07-29 12:16:10 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saving...... +[2024-07-29 12:16:16 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/latest_ckpt.pth saved !!! +[2024-07-29 12:16:17 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 0.591 (0.591) Loss 0.4824 (0.4824) Acc@1 90.674 (90.674) Acc@5 98.975 (98.975) Mem 22339MB +[2024-07-29 12:16:18 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.161) Loss 0.7334 (0.5847) Acc@1 83.154 (88.312) Acc@5 97.363 (98.176) Mem 22339MB +[2024-07-29 12:16:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.140) Loss 0.8037 (0.6699) Acc@1 81.445 (85.740) Acc@5 96.436 (97.435) Mem 22339MB +[2024-07-29 12:16:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.389 Acc@5 97.413 +[2024-07-29 12:16:19 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 257): INFO Accuracy of the network on the 50000 test images: 85.4% +[2024-07-29 12:16:20 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [0/25] Time 1.049 (1.049) Loss 0.4878 (0.4878) Acc@1 90.527 (90.527) Acc@5 99.023 (99.023) Mem 22339MB +[2024-07-29 12:16:22 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [10/25] Time 0.116 (0.204) Loss 0.7349 (0.5901) Acc@1 82.959 (88.308) Acc@5 97.168 (98.162) Mem 22339MB +[2024-07-29 12:16:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 438): INFO Test: [20/25] Time 0.116 (0.162) Loss 0.8086 (0.6745) Acc@1 81.445 (85.738) Acc@5 96.338 (97.431) Mem 22339MB +[2024-07-29 12:16:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 445): INFO * Acc@1 85.377 Acc@5 97.419 +[2024-07-29 12:16:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 272): INFO Accuracy of the network on the 50000 test images: 85.4% +[2024-07-29 12:16:23 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 281): INFO New max accuracy ema: 85.38% +[2024-07-29 12:16:23 vssd_mesa_retrain_base_e300] (utils.py 118): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saving...... +[2024-07-29 12:16:25 vssd_mesa_retrain_base_e300] (utils.py 120): INFO ./exclude/output_mesa/vssd_mesa_retrain_base_e300/20240724223909/best_ckpt_ema.pth saved !!! +[2024-07-29 12:16:25 vssd_mesa_retrain_base_e300] (main_hfai_mnodes.py 291): INFO Training time 0:09:58