diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7934 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 14.999373825923607, + "global_step": 5985, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 5.52933406829834, + "epoch": 0.03, + "learning_rate": 1.669449081803005e-06, + "loss": 5.4118, + "step": 10, + "task_loss": 2.48651123046875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 5.896318435668945, + "epoch": 0.05, + "learning_rate": 3.33889816360601e-06, + "loss": 5.3596, + "step": 20, + "task_loss": 2.43951416015625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 5.618594169616699, + "epoch": 0.08, + "learning_rate": 5.008347245409015e-06, + "loss": 5.2495, + "step": 30, + "task_loss": 2.3388671875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 5.476861953735352, + "epoch": 0.1, + "learning_rate": 6.67779632721202e-06, + "loss": 5.0651, + "step": 40, + "task_loss": 2.18914794921875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 4.707149982452393, + "epoch": 0.13, + "learning_rate": 8.347245409015026e-06, + "loss": 4.8244, + "step": 50, + "task_loss": 2.032379150390625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 4.419103145599365, + "epoch": 0.15, + "learning_rate": 1.001669449081803e-05, + "loss": 4.4922, + "step": 60, + "task_loss": 1.834991455078125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 4.421743392944336, + "epoch": 0.18, + "learning_rate": 1.1686143572621036e-05, + "loss": 4.2271, + "step": 70, + "task_loss": 1.697540283203125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 4.174177169799805, + "epoch": 0.2, + "learning_rate": 1.335559265442404e-05, + "loss": 4.0351, + "step": 80, + "task_loss": 1.718780517578125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 4.20448637008667, + "epoch": 0.23, + "learning_rate": 1.5025041736227046e-05, + "loss": 3.9195, + "step": 90, + "task_loss": 1.6249847412109375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 4.012953281402588, + "epoch": 0.25, + "learning_rate": 1.669449081803005e-05, + "loss": 3.791, + "step": 100, + "task_loss": 1.6454925537109375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.9619874954223633, + "epoch": 0.28, + "learning_rate": 1.8363939899833053e-05, + "loss": 3.6535, + "step": 110, + "task_loss": 1.74847412109375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.734502077102661, + "epoch": 0.3, + "learning_rate": 2.003338898163606e-05, + "loss": 3.5592, + "step": 120, + "task_loss": 1.9617919921875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.4505393505096436, + "epoch": 0.33, + "learning_rate": 2.1702838063439067e-05, + "loss": 3.4397, + "step": 130, + "task_loss": 1.0813446044921875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.4746503829956055, + "epoch": 0.35, + "learning_rate": 2.3372287145242072e-05, + "loss": 3.3614, + "step": 140, + "task_loss": 1.363616943359375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.5675835609436035, + "epoch": 0.38, + "learning_rate": 2.5041736227045077e-05, + "loss": 3.2991, + "step": 150, + "task_loss": 1.799224853515625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.0968575477600098, + "epoch": 0.4, + "learning_rate": 2.671118530884808e-05, + "loss": 3.2155, + "step": 160, + "task_loss": 0.9779510498046875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.098710060119629, + "epoch": 0.43, + "learning_rate": 2.8380634390651084e-05, + "loss": 3.219, + "step": 170, + "task_loss": 1.344268798828125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.267606735229492, + "epoch": 0.45, + "learning_rate": 3.0050083472454093e-05, + "loss": 3.09, + "step": 180, + "task_loss": 1.5805435180664062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.3334648609161377, + "epoch": 0.48, + "learning_rate": 3.1719532554257094e-05, + "loss": 3.0646, + "step": 190, + "task_loss": 1.7886810302734375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.5727791786193848, + "epoch": 0.5, + "learning_rate": 3.33889816360601e-05, + "loss": 3.0265, + "step": 200, + "task_loss": 2.1692123413085938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.188279628753662, + "epoch": 0.53, + "learning_rate": 3.5058430717863105e-05, + "loss": 2.9037, + "step": 210, + "task_loss": 1.5258407592773438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.9483911991119385, + "epoch": 0.55, + "learning_rate": 3.6727879799666106e-05, + "loss": 2.9026, + "step": 220, + "task_loss": 1.400848388671875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.9646759033203125, + "epoch": 0.58, + "learning_rate": 3.839732888146912e-05, + "loss": 2.7934, + "step": 230, + "task_loss": 1.5166206359863281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.761989116668701, + "epoch": 0.6, + "learning_rate": 4.006677796327212e-05, + "loss": 2.85, + "step": 240, + "task_loss": 1.4304275512695312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.454298257827759, + "epoch": 0.63, + "learning_rate": 4.1736227045075125e-05, + "loss": 2.803, + "step": 250, + "task_loss": 1.0836639404296875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.4330437183380127, + "epoch": 0.65, + "learning_rate": 4.3405676126878134e-05, + "loss": 2.877, + "step": 260, + "task_loss": 2.4065017700195312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.8000452518463135, + "epoch": 0.68, + "learning_rate": 4.5075125208681135e-05, + "loss": 2.7497, + "step": 270, + "task_loss": 1.750823974609375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.625284194946289, + "epoch": 0.7, + "learning_rate": 4.6744574290484144e-05, + "loss": 2.5658, + "step": 280, + "task_loss": 1.5195846557617188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.7306337356567383, + "epoch": 0.73, + "learning_rate": 4.8414023372287146e-05, + "loss": 2.6284, + "step": 290, + "task_loss": 1.6767597198486328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.497389793395996, + "epoch": 0.75, + "learning_rate": 5.0083472454090154e-05, + "loss": 2.6684, + "step": 300, + "task_loss": 1.5724601745605469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.830841064453125, + "epoch": 0.78, + "learning_rate": 5.175292153589316e-05, + "loss": 2.5948, + "step": 310, + "task_loss": 1.9135284423828125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.381890296936035, + "epoch": 0.8, + "learning_rate": 5.342237061769616e-05, + "loss": 2.8249, + "step": 320, + "task_loss": 2.4098052978515625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 1.942401647567749, + "epoch": 0.83, + "learning_rate": 5.509181969949917e-05, + "loss": 2.7637, + "step": 330, + "task_loss": 1.1596260070800781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.6451547145843506, + "epoch": 0.85, + "learning_rate": 5.676126878130217e-05, + "loss": 2.7109, + "step": 340, + "task_loss": 2.6936473846435547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.3483595848083496, + "epoch": 0.88, + "learning_rate": 5.8430717863105176e-05, + "loss": 2.6978, + "step": 350, + "task_loss": 2.3931121826171875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.0454773902893066, + "epoch": 0.9, + "learning_rate": 6.0100166944908185e-05, + "loss": 2.7414, + "step": 360, + "task_loss": 2.056713104248047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.457331657409668, + "epoch": 0.93, + "learning_rate": 6.176961602671118e-05, + "loss": 2.6385, + "step": 370, + "task_loss": 1.4607276916503906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.4619457721710205, + "epoch": 0.95, + "learning_rate": 6.343906510851419e-05, + "loss": 2.6759, + "step": 380, + "task_loss": 1.4783172607421875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.8714442253112793, + "epoch": 0.98, + "learning_rate": 6.51085141903172e-05, + "loss": 2.5357, + "step": 390, + "task_loss": 2.118621826171875 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.6209179170344219, + "eval_loss": 2.7820727825164795, + "eval_runtime": 33.3959, + "eval_samples_per_second": 203.558, + "eval_steps_per_second": 3.204, + "step": 399 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.8904120922088623, + "epoch": 1.0, + "learning_rate": 6.67779632721202e-05, + "loss": 2.91, + "step": 400, + "task_loss": 1.6057090759277344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.224745988845825, + "epoch": 1.03, + "learning_rate": 6.844741235392321e-05, + "loss": 2.6201, + "step": 410, + "task_loss": 1.3532609939575195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.6724021434783936, + "epoch": 1.05, + "learning_rate": 7.011686143572621e-05, + "loss": 2.701, + "step": 420, + "task_loss": 1.769845962524414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.517674207687378, + "epoch": 1.08, + "learning_rate": 7.178631051752922e-05, + "loss": 2.8821, + "step": 430, + "task_loss": 2.462800979614258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.8807272911071777, + "epoch": 1.1, + "learning_rate": 7.345575959933221e-05, + "loss": 2.6449, + "step": 440, + "task_loss": 1.9457645416259766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.0602340698242188, + "epoch": 1.13, + "learning_rate": 7.512520868113523e-05, + "loss": 2.7546, + "step": 450, + "task_loss": 2.1220531463623047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.0163254737854004, + "epoch": 1.15, + "learning_rate": 7.679465776293824e-05, + "loss": 2.6791, + "step": 460, + "task_loss": 2.118633270263672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.544952392578125, + "epoch": 1.18, + "learning_rate": 7.846410684474124e-05, + "loss": 2.6155, + "step": 470, + "task_loss": 1.6232109069824219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.0339155197143555, + "epoch": 1.2, + "learning_rate": 8.013355592654425e-05, + "loss": 2.6607, + "step": 480, + "task_loss": 2.111297607421875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.2508902549743652, + "epoch": 1.23, + "learning_rate": 8.180300500834724e-05, + "loss": 2.6619, + "step": 490, + "task_loss": 2.4367713928222656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 1.7746219635009766, + "epoch": 1.25, + "learning_rate": 8.347245409015025e-05, + "loss": 2.6703, + "step": 500, + "task_loss": 0.8428516387939453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.2565126419067383, + "epoch": 1.28, + "learning_rate": 8.514190317195326e-05, + "loss": 2.7742, + "step": 510, + "task_loss": 1.3115043640136719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.6557955741882324, + "epoch": 1.3, + "learning_rate": 8.681135225375627e-05, + "loss": 2.6124, + "step": 520, + "task_loss": 1.7903881072998047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 1.730010747909546, + "epoch": 1.33, + "learning_rate": 8.848080133555928e-05, + "loss": 2.6159, + "step": 530, + "task_loss": 0.8504581451416016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.865082263946533, + "epoch": 1.35, + "learning_rate": 9.015025041736227e-05, + "loss": 2.6776, + "step": 540, + "task_loss": 1.9843921661376953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.7645061016082764, + "epoch": 1.38, + "learning_rate": 9.181969949916528e-05, + "loss": 2.6593, + "step": 550, + "task_loss": 1.812708854675293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.6839334964752197, + "epoch": 1.4, + "learning_rate": 9.348914858096829e-05, + "loss": 2.6528, + "step": 560, + "task_loss": 2.1052331924438477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 1.8773679733276367, + "epoch": 1.43, + "learning_rate": 9.515859766277128e-05, + "loss": 2.6353, + "step": 570, + "task_loss": 1.020782470703125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.2050042152404785, + "epoch": 1.45, + "learning_rate": 9.682804674457429e-05, + "loss": 2.6025, + "step": 580, + "task_loss": 1.3311767578125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.023773193359375, + "epoch": 1.48, + "learning_rate": 9.84974958263773e-05, + "loss": 2.7206, + "step": 590, + "task_loss": 2.2621097564697266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.1227853298187256, + "epoch": 1.5, + "learning_rate": 9.998143334571111e-05, + "loss": 2.7327, + "step": 600, + "task_loss": 2.21990966796875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.0809850692749023, + "epoch": 1.53, + "learning_rate": 9.979576680282213e-05, + "loss": 2.583, + "step": 610, + "task_loss": 1.1689529418945312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.851938247680664, + "epoch": 1.55, + "learning_rate": 9.961010025993317e-05, + "loss": 2.6182, + "step": 620, + "task_loss": 1.9799509048461914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.867166757583618, + "epoch": 1.58, + "learning_rate": 9.94244337170442e-05, + "loss": 2.6952, + "step": 630, + "task_loss": 1.9856109619140625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.0813262462615967, + "epoch": 1.6, + "learning_rate": 9.923876717415522e-05, + "loss": 2.6678, + "step": 640, + "task_loss": 1.1592636108398438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.2972609996795654, + "epoch": 1.63, + "learning_rate": 9.905310063126625e-05, + "loss": 2.6292, + "step": 650, + "task_loss": 2.4136486053466797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.888683557510376, + "epoch": 1.65, + "learning_rate": 9.886743408837728e-05, + "loss": 2.6427, + "step": 660, + "task_loss": 1.9854507446289062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.395509719848633, + "epoch": 1.68, + "learning_rate": 9.868176754548831e-05, + "loss": 2.7457, + "step": 670, + "task_loss": 1.5147476196289062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.6813881397247314, + "epoch": 1.7, + "learning_rate": 9.849610100259934e-05, + "loss": 2.6505, + "step": 680, + "task_loss": 1.7773704528808594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.94386625289917, + "epoch": 1.73, + "learning_rate": 9.831043445971037e-05, + "loss": 2.7619, + "step": 690, + "task_loss": 1.957254409790039 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.845458984375, + "epoch": 1.75, + "learning_rate": 9.81247679168214e-05, + "loss": 2.6894, + "step": 700, + "task_loss": 1.896402359008789 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.2715649604797363, + "epoch": 1.78, + "learning_rate": 9.793910137393241e-05, + "loss": 2.6132, + "step": 710, + "task_loss": 2.270702362060547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.212202787399292, + "epoch": 1.8, + "learning_rate": 9.775343483104346e-05, + "loss": 2.7309, + "step": 720, + "task_loss": 1.4788494110107422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.5183448791503906, + "epoch": 1.83, + "learning_rate": 9.756776828815449e-05, + "loss": 2.6176, + "step": 730, + "task_loss": 1.6445655822753906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.0339484214782715, + "epoch": 1.85, + "learning_rate": 9.73821017452655e-05, + "loss": 2.6213, + "step": 740, + "task_loss": 2.5591773986816406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.872929334640503, + "epoch": 1.88, + "learning_rate": 9.719643520237653e-05, + "loss": 2.7547, + "step": 750, + "task_loss": 1.9304561614990234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.0455856323242188, + "epoch": 1.9, + "learning_rate": 9.701076865948756e-05, + "loss": 2.5792, + "step": 760, + "task_loss": 1.3380451202392578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.4985716342926025, + "epoch": 1.93, + "learning_rate": 9.682510211659859e-05, + "loss": 2.6577, + "step": 770, + "task_loss": 1.663651466369629 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.886143207550049, + "epoch": 1.95, + "learning_rate": 9.663943557370962e-05, + "loss": 2.7676, + "step": 780, + "task_loss": 1.9444026947021484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.7342584133148193, + "epoch": 1.98, + "learning_rate": 9.645376903082065e-05, + "loss": 2.7107, + "step": 790, + "task_loss": 1.7564506530761719 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.6209179170344219, + "eval_loss": 2.733137845993042, + "eval_runtime": 33.2036, + "eval_samples_per_second": 204.737, + "eval_steps_per_second": 3.223, + "step": 798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.581892728805542, + "epoch": 2.01, + "learning_rate": 9.626810248793168e-05, + "loss": 2.6904, + "step": 800, + "task_loss": 1.6179771423339844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.5724754333496094, + "epoch": 2.03, + "learning_rate": 9.60824359450427e-05, + "loss": 2.6259, + "step": 810, + "task_loss": 1.633657455444336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.470709800720215, + "epoch": 2.06, + "learning_rate": 9.589676940215374e-05, + "loss": 2.6808, + "step": 820, + "task_loss": 1.5020484924316406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.9177119731903076, + "epoch": 2.08, + "learning_rate": 9.571110285926477e-05, + "loss": 2.6931, + "step": 830, + "task_loss": 1.9362449645996094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.610812187194824, + "epoch": 2.11, + "learning_rate": 9.552543631637579e-05, + "loss": 2.7249, + "step": 840, + "task_loss": 1.7861461639404297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.928806781768799, + "epoch": 2.13, + "learning_rate": 9.533976977348682e-05, + "loss": 2.6078, + "step": 850, + "task_loss": 2.1046905517578125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.5710256099700928, + "epoch": 2.16, + "learning_rate": 9.515410323059785e-05, + "loss": 2.7071, + "step": 860, + "task_loss": 1.6156253814697266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.940216064453125, + "epoch": 2.18, + "learning_rate": 9.496843668770888e-05, + "loss": 2.6636, + "step": 870, + "task_loss": 2.094156265258789 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.4329566955566406, + "epoch": 2.21, + "learning_rate": 9.478277014481991e-05, + "loss": 2.7426, + "step": 880, + "task_loss": 1.4732303619384766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.444228172302246, + "epoch": 2.23, + "learning_rate": 9.459710360193094e-05, + "loss": 2.6332, + "step": 890, + "task_loss": 1.6369857788085938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.125012159347534, + "epoch": 2.26, + "learning_rate": 9.441143705904197e-05, + "loss": 2.7105, + "step": 900, + "task_loss": 2.2773313522338867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.564040184020996, + "epoch": 2.28, + "learning_rate": 9.422577051615298e-05, + "loss": 2.6147, + "step": 910, + "task_loss": 1.6573524475097656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.1073150634765625, + "epoch": 2.31, + "learning_rate": 9.404010397326403e-05, + "loss": 2.6888, + "step": 920, + "task_loss": 1.4619426727294922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.369579315185547, + "epoch": 2.33, + "learning_rate": 9.385443743037506e-05, + "loss": 2.6488, + "step": 930, + "task_loss": 1.4727344512939453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.495575428009033, + "epoch": 2.36, + "learning_rate": 9.366877088748607e-05, + "loss": 2.6042, + "step": 940, + "task_loss": 2.6389455795288086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.1702356338500977, + "epoch": 2.38, + "learning_rate": 9.34831043445971e-05, + "loss": 2.6761, + "step": 950, + "task_loss": 2.283937454223633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.710407257080078, + "epoch": 2.41, + "learning_rate": 9.329743780170813e-05, + "loss": 2.7026, + "step": 960, + "task_loss": 1.8128604888916016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.4848060607910156, + "epoch": 2.43, + "learning_rate": 9.311177125881916e-05, + "loss": 2.7392, + "step": 970, + "task_loss": 1.635202407836914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.452624797821045, + "epoch": 2.46, + "learning_rate": 9.292610471593019e-05, + "loss": 2.6791, + "step": 980, + "task_loss": 1.4654407501220703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.5394110679626465, + "epoch": 2.48, + "learning_rate": 9.274043817304122e-05, + "loss": 2.6464, + "step": 990, + "task_loss": 1.6244525909423828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.6164135932922363, + "epoch": 2.51, + "learning_rate": 9.255477163015225e-05, + "loss": 2.5907, + "step": 1000, + "task_loss": 1.8127546310424805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.882533073425293, + "epoch": 2.53, + "learning_rate": 9.236910508726327e-05, + "loss": 2.7257, + "step": 1010, + "task_loss": 1.9839677810668945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.2474217414855957, + "epoch": 2.56, + "learning_rate": 9.218343854437431e-05, + "loss": 2.5641, + "step": 1020, + "task_loss": 1.3350105285644531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.0137295722961426, + "epoch": 2.58, + "learning_rate": 9.199777200148534e-05, + "loss": 2.744, + "step": 1030, + "task_loss": 2.1058349609375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.4671058654785156, + "epoch": 2.61, + "learning_rate": 9.181210545859636e-05, + "loss": 2.6377, + "step": 1040, + "task_loss": 1.6492595672607422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.015052318572998, + "epoch": 2.63, + "learning_rate": 9.162643891570739e-05, + "loss": 2.7385, + "step": 1050, + "task_loss": 2.105600357055664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.6651811599731445, + "epoch": 2.66, + "learning_rate": 9.144077237281842e-05, + "loss": 2.6828, + "step": 1060, + "task_loss": 1.9311504364013672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.086965560913086, + "epoch": 2.68, + "learning_rate": 9.125510582992945e-05, + "loss": 2.6166, + "step": 1070, + "task_loss": 1.159210205078125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.3544559478759766, + "epoch": 2.71, + "learning_rate": 9.106943928704048e-05, + "loss": 2.6123, + "step": 1080, + "task_loss": 2.4316577911376953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.5101733207702637, + "epoch": 2.73, + "learning_rate": 9.088377274415151e-05, + "loss": 2.7655, + "step": 1090, + "task_loss": 1.6256141662597656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.988664150238037, + "epoch": 2.76, + "learning_rate": 9.069810620126254e-05, + "loss": 2.7445, + "step": 1100, + "task_loss": 2.0682754516601562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.2268319129943848, + "epoch": 2.78, + "learning_rate": 9.051243965837355e-05, + "loss": 2.7991, + "step": 1110, + "task_loss": 2.38372802734375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.6825551986694336, + "epoch": 2.81, + "learning_rate": 9.03267731154846e-05, + "loss": 2.6202, + "step": 1120, + "task_loss": 1.9193954467773438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.794699192047119, + "epoch": 2.83, + "learning_rate": 9.014110657259563e-05, + "loss": 2.6386, + "step": 1130, + "task_loss": 1.9378623962402344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.007878541946411, + "epoch": 2.86, + "learning_rate": 8.995544002970664e-05, + "loss": 2.709, + "step": 1140, + "task_loss": 2.1123504638671875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 1.961939811706543, + "epoch": 2.88, + "learning_rate": 8.976977348681767e-05, + "loss": 2.5682, + "step": 1150, + "task_loss": 1.0176010131835938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.2347028255462646, + "epoch": 2.91, + "learning_rate": 8.95841069439287e-05, + "loss": 2.6111, + "step": 1160, + "task_loss": 1.3358421325683594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.8768200874328613, + "epoch": 2.93, + "learning_rate": 8.939844040103975e-05, + "loss": 2.6576, + "step": 1170, + "task_loss": 2.9530563354492188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 3.2214670181274414, + "epoch": 2.96, + "learning_rate": 8.921277385815076e-05, + "loss": 2.7044, + "step": 1180, + "task_loss": 2.277057647705078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 3.172197675286656e-08, + "compression_loss": 0.0, + "distillation_loss": 2.128249406814575, + "epoch": 2.98, + "learning_rate": 8.902710731526179e-05, + "loss": 2.671, + "step": 1190, + "task_loss": 1.3369560241699219 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.6209179170344219, + "eval_loss": 2.7330424785614014, + "eval_runtime": 33.3446, + "eval_samples_per_second": 203.871, + "eval_steps_per_second": 3.209, + "step": 1197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.00029970009999999995, + "compression/movement_sparsity/importance_threshold": -0.002706194319112004, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010277989498644986, + "compression/movement_sparsity/model_sparsity": 0.0009239977388574971, + "compression_loss": 0.08153120428323746, + "distillation_loss": 2.5982580184936523, + "epoch": 3.01, + "learning_rate": 8.884144077237282e-05, + "loss": 2.8166, + "step": 1200, + "task_loss": 1.6295490264892578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0017892216000000016, + "compression/movement_sparsity/importance_threshold": -0.002665763802598631, + "compression/movement_sparsity/linear_layer_sparsity": 0.0037766768292682926, + "compression/movement_sparsity/model_sparsity": 0.0033951714498825546, + "compression_loss": 0.48660892248153687, + "distillation_loss": 2.697866916656494, + "epoch": 3.03, + "learning_rate": 8.865577422948385e-05, + "loss": 3.016, + "step": 1210, + "task_loss": 1.6323223114013672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.003263833100000002, + "compression/movement_sparsity/importance_threshold": -0.002625737992563959, + "compression/movement_sparsity/linear_layer_sparsity": 0.09110705312782294, + "compression/movement_sparsity/model_sparsity": 0.08190303522217966, + "compression_loss": 0.8870441913604736, + "distillation_loss": 2.3551669120788574, + "epoch": 3.06, + "learning_rate": 8.847010768659488e-05, + "loss": 3.3288, + "step": 1220, + "task_loss": 1.4615478515625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.004723609600000001, + "compression/movement_sparsity/importance_threshold": -0.0025861148532611133, + "compression/movement_sparsity/linear_layer_sparsity": 0.180595982478922, + "compression/movement_sparsity/model_sparsity": 0.16235138518241088, + "compression_loss": 1.2825900316238403, + "distillation_loss": 2.554757595062256, + "epoch": 3.08, + "learning_rate": 8.828444114370591e-05, + "loss": 3.8362, + "step": 1230, + "task_loss": 1.930368423461914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.006168626100000008, + "compression/movement_sparsity/importance_threshold": -0.0025468923489432188, + "compression/movement_sparsity/linear_layer_sparsity": 0.2533686789558868, + "compression/movement_sparsity/model_sparsity": 0.22777225227938264, + "compression_loss": 1.6731985807418823, + "distillation_loss": 2.6167359352111816, + "epoch": 3.11, + "learning_rate": 8.809877460081694e-05, + "loss": 4.3183, + "step": 1240, + "task_loss": 1.9159774780273438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.007598957600000011, + "compression/movement_sparsity/importance_threshold": -0.002508068443863401, + "compression/movement_sparsity/linear_layer_sparsity": 0.3353249562443541, + "compression/movement_sparsity/model_sparsity": 0.30144893301016396, + "compression_loss": 2.0588760375976562, + "distillation_loss": 2.5464272499084473, + "epoch": 3.13, + "learning_rate": 8.791310805792796e-05, + "loss": 4.5268, + "step": 1250, + "task_loss": 1.7708911895751953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.009014679100000011, + "compression/movement_sparsity/importance_threshold": -0.0024696411022747853, + "compression/movement_sparsity/linear_layer_sparsity": 0.42709878895663955, + "compression/movement_sparsity/model_sparsity": 0.383951354983089, + "compression_loss": 2.439720869064331, + "distillation_loss": 3.011214256286621, + "epoch": 3.16, + "learning_rate": 8.7727441515039e-05, + "loss": 5.0573, + "step": 1260, + "task_loss": 2.087390899658203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.010415865600000008, + "compression/movement_sparsity/importance_threshold": -0.002431608288430497, + "compression/movement_sparsity/linear_layer_sparsity": 0.5258351211984342, + "compression/movement_sparsity/model_sparsity": 0.4727128930590834, + "compression_loss": 2.8156750202178955, + "distillation_loss": 3.1555676460266113, + "epoch": 3.18, + "learning_rate": 8.754177497215003e-05, + "loss": 5.2667, + "step": 1270, + "task_loss": 2.250608444213867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.011802592100000009, + "compression/movement_sparsity/importance_threshold": -0.0023939679665836607, + "compression/movement_sparsity/linear_layer_sparsity": 0.6159454983438724, + "compression/movement_sparsity/model_sparsity": 0.5537199093259016, + "compression_loss": 3.186840057373047, + "distillation_loss": 3.0633091926574707, + "epoch": 3.21, + "learning_rate": 8.735610842926105e-05, + "loss": 5.6874, + "step": 1280, + "task_loss": 2.2352561950683594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.013174933600000016, + "compression/movement_sparsity/importance_threshold": -0.0023567181009874023, + "compression/movement_sparsity/linear_layer_sparsity": 0.6933998019233665, + "compression/movement_sparsity/model_sparsity": 0.6233494262446064, + "compression_loss": 3.553344249725342, + "distillation_loss": 2.3720521926879883, + "epoch": 3.23, + "learning_rate": 8.717044188637208e-05, + "loss": 5.8825, + "step": 1290, + "task_loss": 1.3716545104980469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.014532965100000017, + "compression/movement_sparsity/importance_threshold": -0.002319856655894847, + "compression/movement_sparsity/linear_layer_sparsity": 0.7389964313271605, + "compression/movement_sparsity/model_sparsity": 0.6643396763893116, + "compression_loss": 3.9163827896118164, + "distillation_loss": 2.989243507385254, + "epoch": 3.26, + "learning_rate": 8.698477534348311e-05, + "loss": 6.4051, + "step": 1300, + "task_loss": 2.0763330459594727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01587676160000001, + "compression/movement_sparsity/importance_threshold": -0.00228338159555912, + "compression/movement_sparsity/linear_layer_sparsity": 0.7709859840785908, + "compression/movement_sparsity/model_sparsity": 0.693097498764429, + "compression_loss": 4.275637149810791, + "distillation_loss": 2.6294326782226562, + "epoch": 3.28, + "learning_rate": 8.679910880059414e-05, + "loss": 6.688, + "step": 1310, + "task_loss": 1.8505735397338867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01720639809999999, + "compression/movement_sparsity/importance_threshold": -0.002247290884233347, + "compression/movement_sparsity/linear_layer_sparsity": 0.7967849362955435, + "compression/movement_sparsity/model_sparsity": 0.7162901232779461, + "compression_loss": 4.6308183670043945, + "distillation_loss": 2.626801013946533, + "epoch": 3.31, + "learning_rate": 8.661344225770517e-05, + "loss": 6.9657, + "step": 1320, + "task_loss": 1.6896142959594727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01852194960000002, + "compression/movement_sparsity/importance_threshold": -0.0022115824861706507, + "compression/movement_sparsity/linear_layer_sparsity": 0.8185987019158386, + "compression/movement_sparsity/model_sparsity": 0.7359001629029246, + "compression_loss": 4.981749534606934, + "distillation_loss": 2.623103141784668, + "epoch": 3.33, + "learning_rate": 8.64277757148162e-05, + "loss": 7.2296, + "step": 1330, + "task_loss": 1.6925630569458008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.019823491099999993, + "compression/movement_sparsity/importance_threshold": -0.0021762543656241606, + "compression/movement_sparsity/linear_layer_sparsity": 0.8377232243676603, + "compression/movement_sparsity/model_sparsity": 0.7530926389575905, + "compression_loss": 5.328485012054443, + "distillation_loss": 2.898040533065796, + "epoch": 3.36, + "learning_rate": 8.624210917192723e-05, + "loss": 7.6878, + "step": 1340, + "task_loss": 1.9257926940917969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.021111097599999986, + "compression/movement_sparsity/importance_threshold": -0.0021413044868469986, + "compression/movement_sparsity/linear_layer_sparsity": 0.8541145950956037, + "compression/movement_sparsity/model_sparsity": 0.7678280787288708, + "compression_loss": 5.670895576477051, + "distillation_loss": 2.477210283279419, + "epoch": 3.38, + "learning_rate": 8.605644262903824e-05, + "loss": 7.9455, + "step": 1350, + "task_loss": 1.5733938217163086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.022384844099999992, + "compression/movement_sparsity/importance_threshold": -0.002106730814092291, + "compression/movement_sparsity/linear_layer_sparsity": 0.8677667212436013, + "compression/movement_sparsity/model_sparsity": 0.7801010048887795, + "compression_loss": 6.009064197540283, + "distillation_loss": 1.6416271924972534, + "epoch": 3.41, + "learning_rate": 8.587077608614929e-05, + "loss": 8.2746, + "step": 1360, + "task_loss": 0.8507061004638672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0236448056, + "compression/movement_sparsity/importance_threshold": -0.002072531311613163, + "compression/movement_sparsity/linear_layer_sparsity": 0.8817455562142427, + "compression/movement_sparsity/model_sparsity": 0.7926676342574509, + "compression_loss": 6.342341423034668, + "distillation_loss": 2.047217845916748, + "epoch": 3.43, + "learning_rate": 8.568510954326032e-05, + "loss": 8.5326, + "step": 1370, + "task_loss": 1.360997200012207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.024891057099999992, + "compression/movement_sparsity/importance_threshold": -0.00203870394366274, + "compression/movement_sparsity/linear_layer_sparsity": 0.8926971238896416, + "compression/movement_sparsity/model_sparsity": 0.8025128246665015, + "compression_loss": 6.671533107757568, + "distillation_loss": 2.5189008712768555, + "epoch": 3.46, + "learning_rate": 8.549944300037133e-05, + "loss": 8.8876, + "step": 1380, + "task_loss": 1.7200055122375488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.026123673599999998, + "compression/movement_sparsity/importance_threshold": -0.0020052466744941476, + "compression/movement_sparsity/linear_layer_sparsity": 0.9013934996988858, + "compression/movement_sparsity/model_sparsity": 0.8103306529672842, + "compression_loss": 6.9974260330200195, + "distillation_loss": 2.345778465270996, + "epoch": 3.48, + "learning_rate": 8.531377645748236e-05, + "loss": 9.0697, + "step": 1390, + "task_loss": 1.4693355560302734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02734273009999999, + "compression/movement_sparsity/importance_threshold": -0.00197215746836051, + "compression/movement_sparsity/linear_layer_sparsity": 0.9077951741757001, + "compression/movement_sparsity/model_sparsity": 0.8160856011198281, + "compression_loss": 7.319570064544678, + "distillation_loss": 2.137160301208496, + "epoch": 3.51, + "learning_rate": 8.512810991459339e-05, + "loss": 9.4752, + "step": 1400, + "task_loss": 1.569617748260498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.028548301599999992, + "compression/movement_sparsity/importance_threshold": -0.0019394342895149535, + "compression/movement_sparsity/linear_layer_sparsity": 0.915045555273261, + "compression/movement_sparsity/model_sparsity": 0.8226035156832395, + "compression_loss": 7.63688850402832, + "distillation_loss": 2.0281312465667725, + "epoch": 3.53, + "learning_rate": 8.494244337170442e-05, + "loss": 9.6117, + "step": 1410, + "task_loss": 1.2063002586364746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.029740463099999993, + "compression/movement_sparsity/importance_threshold": -0.0019070751022106027, + "compression/movement_sparsity/linear_layer_sparsity": 0.9220578327311051, + "compression/movement_sparsity/model_sparsity": 0.828907380921515, + "compression_loss": 7.949488162994385, + "distillation_loss": 1.964815378189087, + "epoch": 3.56, + "learning_rate": 8.475677682881545e-05, + "loss": 10.0276, + "step": 1420, + "task_loss": 1.3674798011779785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0309192896, + "compression/movement_sparsity/importance_threshold": -0.001875077870700583, + "compression/movement_sparsity/linear_layer_sparsity": 0.9270990006775067, + "compression/movement_sparsity/model_sparsity": 0.8334392671123675, + "compression_loss": 8.258493423461914, + "distillation_loss": 1.8291049003601074, + "epoch": 3.58, + "learning_rate": 8.457111028592648e-05, + "loss": 10.2532, + "step": 1430, + "task_loss": 1.1121773719787598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0320848561, + "compression/movement_sparsity/importance_threshold": -0.0018434405592380196, + "compression/movement_sparsity/linear_layer_sparsity": 0.9316989423366456, + "compression/movement_sparsity/model_sparsity": 0.8375745017059022, + "compression_loss": 8.56396484375, + "distillation_loss": 1.9980548620224, + "epoch": 3.61, + "learning_rate": 8.438544374303751e-05, + "loss": 10.5547, + "step": 1440, + "task_loss": 1.2044320106506348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.033237237600000004, + "compression/movement_sparsity/importance_threshold": -0.001812161132076038, + "compression/movement_sparsity/linear_layer_sparsity": 0.9352631455133996, + "compression/movement_sparsity/model_sparsity": 0.8407786328377868, + "compression_loss": 8.865992546081543, + "distillation_loss": 1.910906434059143, + "epoch": 3.63, + "learning_rate": 8.419977720014853e-05, + "loss": 10.686, + "step": 1450, + "task_loss": 1.246495246887207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0343765091, + "compression/movement_sparsity/importance_threshold": -0.0017812375534677636, + "compression/movement_sparsity/linear_layer_sparsity": 0.9391021388512496, + "compression/movement_sparsity/model_sparsity": 0.8442297935766381, + "compression_loss": 9.163341522216797, + "distillation_loss": 1.956077218055725, + "epoch": 3.66, + "learning_rate": 8.401411065725957e-05, + "loss": 11.0227, + "step": 1460, + "task_loss": 0.9735760688781738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03550274560000001, + "compression/movement_sparsity/importance_threshold": -0.0017506677876663206, + "compression/movement_sparsity/linear_layer_sparsity": 0.9422174937895212, + "compression/movement_sparsity/model_sparsity": 0.8470304211642261, + "compression_loss": 9.45726203918457, + "distillation_loss": 1.7877765893936157, + "epoch": 3.68, + "learning_rate": 8.38284441143706e-05, + "loss": 11.3054, + "step": 1470, + "task_loss": 0.8829164505004883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0366160221, + "compression/movement_sparsity/importance_threshold": -0.0017204497989248356, + "compression/movement_sparsity/linear_layer_sparsity": 0.944639862804878, + "compression/movement_sparsity/model_sparsity": 0.849208071424357, + "compression_loss": 9.746960639953613, + "distillation_loss": 2.6204113960266113, + "epoch": 3.71, + "learning_rate": 8.364277757148162e-05, + "loss": 11.5935, + "step": 1480, + "task_loss": 1.7807934284210205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03771641360000001, + "compression/movement_sparsity/importance_threshold": -0.0016905815514964328, + "compression/movement_sparsity/linear_layer_sparsity": 0.9472912902740138, + "compression/movement_sparsity/model_sparsity": 0.8515916396095828, + "compression_loss": 10.032690048217773, + "distillation_loss": 2.0600037574768066, + "epoch": 3.73, + "learning_rate": 8.345711102859265e-05, + "loss": 11.7532, + "step": 1490, + "task_loss": 1.5130555629730225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.038803995100000016, + "compression/movement_sparsity/importance_threshold": -0.0016610610096342382, + "compression/movement_sparsity/linear_layer_sparsity": 0.9496682216388136, + "compression/movement_sparsity/model_sparsity": 0.8537284425376481, + "compression_loss": 10.314544677734375, + "distillation_loss": 1.842912197113037, + "epoch": 3.76, + "learning_rate": 8.327144448570368e-05, + "loss": 11.966, + "step": 1500, + "task_loss": 1.2120623588562012 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039878841600000006, + "compression/movement_sparsity/importance_threshold": -0.0016318861375913768, + "compression/movement_sparsity/linear_layer_sparsity": 0.9515408927092743, + "compression/movement_sparsity/model_sparsity": 0.8554119278439228, + "compression_loss": 10.59346866607666, + "distillation_loss": 2.060537815093994, + "epoch": 3.78, + "learning_rate": 8.308577794281471e-05, + "loss": 12.2737, + "step": 1510, + "task_loss": 1.4073264598846436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04094102810000001, + "compression/movement_sparsity/importance_threshold": -0.0016030548996209734, + "compression/movement_sparsity/linear_layer_sparsity": 0.9531386442336646, + "compression/movement_sparsity/model_sparsity": 0.8568482672293158, + "compression_loss": 10.868535041809082, + "distillation_loss": 1.7969244718551636, + "epoch": 3.81, + "learning_rate": 8.290011139992574e-05, + "loss": 12.5202, + "step": 1520, + "task_loss": 1.4390745162963867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04199062959999999, + "compression/movement_sparsity/importance_threshold": -0.0015745652599761545, + "compression/movement_sparsity/linear_layer_sparsity": 0.9549563031654622, + "compression/movement_sparsity/model_sparsity": 0.8584822979738328, + "compression_loss": 11.140108108520508, + "distillation_loss": 1.244471549987793, + "epoch": 3.83, + "learning_rate": 8.271444485703677e-05, + "loss": 12.7825, + "step": 1530, + "task_loss": 0.7227206230163574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.043027721100000006, + "compression/movement_sparsity/importance_threshold": -0.0015464151829100434, + "compression/movement_sparsity/linear_layer_sparsity": 0.9562602802243301, + "compression/movement_sparsity/model_sparsity": 0.859654541328766, + "compression_loss": 11.407764434814453, + "distillation_loss": 1.4010082483291626, + "epoch": 3.86, + "learning_rate": 8.25287783141478e-05, + "loss": 12.948, + "step": 1540, + "task_loss": 0.951411247253418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.044052377599999984, + "compression/movement_sparsity/importance_threshold": -0.0015186026326757674, + "compression/movement_sparsity/linear_layer_sparsity": 0.9577688737390846, + "compression/movement_sparsity/model_sparsity": 0.8610107298528967, + "compression_loss": 11.670872688293457, + "distillation_loss": 1.8429977893829346, + "epoch": 3.88, + "learning_rate": 8.234311177125881e-05, + "loss": 13.3417, + "step": 1550, + "task_loss": 1.2477965354919434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.045064674100000014, + "compression/movement_sparsity/importance_threshold": -0.0014911255735264495, + "compression/movement_sparsity/linear_layer_sparsity": 0.9588648115213791, + "compression/movement_sparsity/model_sparsity": 0.8619959510068873, + "compression_loss": 11.931397438049316, + "distillation_loss": 1.6905767917633057, + "epoch": 3.91, + "learning_rate": 8.215744522836986e-05, + "loss": 13.5459, + "step": 1560, + "task_loss": 1.1686151027679443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04606468559999999, + "compression/movement_sparsity/importance_threshold": -0.0014639819697152175, + "compression/movement_sparsity/linear_layer_sparsity": 0.9599089953139115, + "compression/movement_sparsity/model_sparsity": 0.8629346465949735, + "compression_loss": 12.188403129577637, + "distillation_loss": 1.89365816116333, + "epoch": 3.93, + "learning_rate": 8.197177868548089e-05, + "loss": 13.7823, + "step": 1570, + "task_loss": 1.514646291732788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04705248710000001, + "compression/movement_sparsity/importance_threshold": -0.0014371697854951941, + "compression/movement_sparsity/linear_layer_sparsity": 0.9608784416403192, + "compression/movement_sparsity/model_sparsity": 0.8638061550362973, + "compression_loss": 12.442070007324219, + "distillation_loss": 2.256486415863037, + "epoch": 3.96, + "learning_rate": 8.17861121425919e-05, + "loss": 13.9808, + "step": 1580, + "task_loss": 1.5805597305297852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04802815359999999, + "compression/movement_sparsity/importance_threshold": -0.001410686985119507, + "compression/movement_sparsity/linear_layer_sparsity": 0.9616035385614273, + "compression/movement_sparsity/model_sparsity": 0.8644579993625997, + "compression_loss": 12.692581176757812, + "distillation_loss": 2.099273443222046, + "epoch": 3.98, + "learning_rate": 8.160044559970293e-05, + "loss": 14.208, + "step": 1590, + "task_loss": 1.352173089981079 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.7138864371874081, + "eval_loss": 14.265999794006348, + "eval_runtime": 33.0781, + "eval_samples_per_second": 205.513, + "eval_steps_per_second": 3.235, + "step": 1596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0490874603375, + "compression/movement_sparsity/importance_threshold": -0.0013819339133825624, + "compression/movement_sparsity/linear_layer_sparsity": 0.9624858852755194, + "compression/movement_sparsity/model_sparsity": 0.8652512073913051, + "compression_loss": 12.964597702026367, + "distillation_loss": 1.4180299043655396, + "epoch": 4.01, + "learning_rate": 8.141477905681396e-05, + "loss": 14.7271, + "step": 1600, + "task_loss": 0.6856341361999512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0500378874625, + "compression/movement_sparsity/importance_threshold": -0.001356136192723901, + "compression/movement_sparsity/linear_layer_sparsity": 0.9630990524314965, + "compression/movement_sparsity/model_sparsity": 0.8658024296073474, + "compression_loss": 13.209151268005371, + "distillation_loss": 1.6582415103912354, + "epoch": 4.04, + "learning_rate": 8.1229112513925e-05, + "loss": 14.7, + "step": 1610, + "task_loss": 1.0920584201812744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05097641208750001, + "compression/movement_sparsity/importance_threshold": -0.0013306615450942628, + "compression/movement_sparsity/linear_layer_sparsity": 0.9636999985885275, + "compression/movement_sparsity/model_sparsity": 0.866342665445441, + "compression_loss": 13.450250625610352, + "distillation_loss": 2.3805365562438965, + "epoch": 4.06, + "learning_rate": 8.104344597103602e-05, + "loss": 15.0142, + "step": 1620, + "task_loss": 1.810115098953247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.051903109212500013, + "compression/movement_sparsity/importance_threshold": -0.0013055079347467735, + "compression/movement_sparsity/linear_layer_sparsity": 0.9643645080548028, + "compression/movement_sparsity/model_sparsity": 0.8669400431376588, + "compression_loss": 13.687724113464355, + "distillation_loss": 1.781688928604126, + "epoch": 4.09, + "learning_rate": 8.085777942814705e-05, + "loss": 15.1401, + "step": 1630, + "task_loss": 1.2644379138946533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.052818053837500004, + "compression/movement_sparsity/importance_threshold": -0.0012806733259345584, + "compression/movement_sparsity/linear_layer_sparsity": 0.9648750141147244, + "compression/movement_sparsity/model_sparsity": 0.8673989755493348, + "compression_loss": 13.92128849029541, + "distillation_loss": 1.3930050134658813, + "epoch": 4.11, + "learning_rate": 8.067211288525808e-05, + "loss": 15.2677, + "step": 1640, + "task_loss": 0.8308200836181641 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05372132096250001, + "compression/movement_sparsity/importance_threshold": -0.0012561556829107417, + "compression/movement_sparsity/linear_layer_sparsity": 0.9653256737428485, + "compression/movement_sparsity/model_sparsity": 0.8678041074884378, + "compression_loss": 14.151246070861816, + "distillation_loss": 1.2472010850906372, + "epoch": 4.14, + "learning_rate": 8.04864463423691e-05, + "loss": 15.5456, + "step": 1650, + "task_loss": 0.7744660377502441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05461298558750001, + "compression/movement_sparsity/importance_threshold": -0.0012319529699284498, + "compression/movement_sparsity/linear_layer_sparsity": 0.9656315398223426, + "compression/movement_sparsity/model_sparsity": 0.8680790735829317, + "compression_loss": 14.37841510772705, + "distillation_loss": 1.2477915287017822, + "epoch": 4.16, + "learning_rate": 8.030077979948014e-05, + "loss": 15.7375, + "step": 1660, + "task_loss": 0.906688928604126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05549312271250001, + "compression/movement_sparsity/importance_threshold": -0.0012080631512408073, + "compression/movement_sparsity/linear_layer_sparsity": 0.9660791177356218, + "compression/movement_sparsity/model_sparsity": 0.8684814351360651, + "compression_loss": 14.601975440979004, + "distillation_loss": 1.6081792116165161, + "epoch": 4.19, + "learning_rate": 8.011511325659117e-05, + "loss": 15.9181, + "step": 1670, + "task_loss": 1.0709311962127686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05636180733750001, + "compression/movement_sparsity/importance_threshold": -0.00118448419110094, + "compression/movement_sparsity/linear_layer_sparsity": 0.9665842484379705, + "compression/movement_sparsity/model_sparsity": 0.8689355352332824, + "compression_loss": 14.82110595703125, + "distillation_loss": 1.8337922096252441, + "epoch": 4.21, + "learning_rate": 7.992944671370219e-05, + "loss": 16.1881, + "step": 1680, + "task_loss": 1.3984484672546387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05721911446249999, + "compression/movement_sparsity/importance_threshold": -0.001161214053761973, + "compression/movement_sparsity/linear_layer_sparsity": 0.9671261597598615, + "compression/movement_sparsity/model_sparsity": 0.8694227002042684, + "compression_loss": 15.03636646270752, + "distillation_loss": 1.0711318254470825, + "epoch": 4.24, + "learning_rate": 7.974378017081322e-05, + "loss": 16.3072, + "step": 1690, + "task_loss": 0.5143593549728394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.058065119087500006, + "compression/movement_sparsity/importance_threshold": -0.0011382507034770303, + "compression/movement_sparsity/linear_layer_sparsity": 0.967436001486751, + "compression/movement_sparsity/model_sparsity": 0.869701240308143, + "compression_loss": 15.248674392700195, + "distillation_loss": 1.7697985172271729, + "epoch": 4.26, + "learning_rate": 7.955811362792425e-05, + "loss": 16.6005, + "step": 1700, + "task_loss": 1.1664330959320068 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0588998962125, + "compression/movement_sparsity/importance_threshold": -0.0011155921044992392, + "compression/movement_sparsity/linear_layer_sparsity": 0.9677300347222222, + "compression/movement_sparsity/model_sparsity": 0.8699655689664324, + "compression_loss": 15.458734512329102, + "distillation_loss": 2.0738933086395264, + "epoch": 4.29, + "learning_rate": 7.937244708503528e-05, + "loss": 16.8255, + "step": 1710, + "task_loss": 1.4591351747512817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.059723520837500016, + "compression/movement_sparsity/importance_threshold": -0.0010932362210817228, + "compression/movement_sparsity/linear_layer_sparsity": 0.9680093415951521, + "compression/movement_sparsity/model_sparsity": 0.8702166589864236, + "compression_loss": 15.665163040161133, + "distillation_loss": 1.6533656120300293, + "epoch": 4.31, + "learning_rate": 7.918678054214631e-05, + "loss": 16.9012, + "step": 1720, + "task_loss": 1.0374521017074585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.060536067962499995, + "compression/movement_sparsity/importance_threshold": -0.0010711810174776086, + "compression/movement_sparsity/linear_layer_sparsity": 0.9683388616004216, + "compression/movement_sparsity/model_sparsity": 0.8705128893793341, + "compression_loss": 15.870290756225586, + "distillation_loss": 0.8290092945098877, + "epoch": 4.34, + "learning_rate": 7.901968065354623e-05, + "loss": 17.1133, + "step": 1730, + "task_loss": 0.7305871248245239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06133761258749999, + "compression/movement_sparsity/importance_threshold": -0.0010494244579400205, + "compression/movement_sparsity/linear_layer_sparsity": 0.9686961993751882, + "compression/movement_sparsity/model_sparsity": 0.8708341272639182, + "compression_loss": 16.070955276489258, + "distillation_loss": 1.2938363552093506, + "epoch": 4.36, + "learning_rate": 7.883401411065726e-05, + "loss": 17.4102, + "step": 1740, + "task_loss": 0.883080244064331 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0621282297125, + "compression/movement_sparsity/importance_threshold": -0.0010279645067220837, + "compression/movement_sparsity/linear_layer_sparsity": 0.9689277984793737, + "compression/movement_sparsity/model_sparsity": 0.8710423291713395, + "compression_loss": 16.267589569091797, + "distillation_loss": 0.7971197366714478, + "epoch": 4.39, + "learning_rate": 7.864834756776829e-05, + "loss": 17.5585, + "step": 1750, + "task_loss": 0.5288671255111694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0629079943375, + "compression/movement_sparsity/importance_threshold": -0.0010067991280769237, + "compression/movement_sparsity/linear_layer_sparsity": 0.9690925996499549, + "compression/movement_sparsity/model_sparsity": 0.8711904813767676, + "compression_loss": 16.4605770111084, + "distillation_loss": 1.884833574295044, + "epoch": 4.41, + "learning_rate": 7.846268102487932e-05, + "loss": 17.7748, + "step": 1760, + "task_loss": 1.292289137840271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0636769814625, + "compression/movement_sparsity/importance_threshold": -0.0009859262862576656, + "compression/movement_sparsity/linear_layer_sparsity": 0.9693808811352003, + "compression/movement_sparsity/model_sparsity": 0.8714496393528463, + "compression_loss": 16.65067481994629, + "distillation_loss": 1.5404561758041382, + "epoch": 4.44, + "learning_rate": 7.827701448199035e-05, + "loss": 17.8402, + "step": 1770, + "task_loss": 0.9988787174224854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0644352660875, + "compression/movement_sparsity/importance_threshold": -0.0009653439455174352, + "compression/movement_sparsity/linear_layer_sparsity": 0.9696630580021078, + "compression/movement_sparsity/model_sparsity": 0.8717033094269467, + "compression_loss": 16.837390899658203, + "distillation_loss": 1.2562533617019653, + "epoch": 4.46, + "learning_rate": 7.809134793910138e-05, + "loss": 18.0467, + "step": 1780, + "task_loss": 0.6564147472381592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06518292321250001, + "compression/movement_sparsity/importance_threshold": -0.0009450500701093571, + "compression/movement_sparsity/linear_layer_sparsity": 0.9697970773110509, + "compression/movement_sparsity/model_sparsity": 0.8718237894946541, + "compression_loss": 17.021377563476562, + "distillation_loss": 1.7418609857559204, + "epoch": 4.49, + "learning_rate": 7.790568139621241e-05, + "loss": 18.2702, + "step": 1790, + "task_loss": 1.3079674243927002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06592002783749999, + "compression/movement_sparsity/importance_threshold": -0.000925042624286557, + "compression/movement_sparsity/linear_layer_sparsity": 0.9699971417682927, + "compression/movement_sparsity/model_sparsity": 0.8720036425288507, + "compression_loss": 17.20431137084961, + "distillation_loss": 0.8406474590301514, + "epoch": 4.51, + "learning_rate": 7.772001485332343e-05, + "loss": 18.2659, + "step": 1800, + "task_loss": 0.4409937858581543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0666466549625, + "compression/movement_sparsity/importance_threshold": -0.0009053195723021598, + "compression/movement_sparsity/linear_layer_sparsity": 0.970332531146492, + "compression/movement_sparsity/model_sparsity": 0.8723051493438944, + "compression_loss": 17.385215759277344, + "distillation_loss": 1.0912799835205078, + "epoch": 4.54, + "learning_rate": 7.753434831043447e-05, + "loss": 18.6708, + "step": 1810, + "task_loss": 0.6726193428039551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06736287958750001, + "compression/movement_sparsity/importance_threshold": -0.0008858788784092907, + "compression/movement_sparsity/linear_layer_sparsity": 0.9704807945648901, + "compression/movement_sparsity/model_sparsity": 0.8724384345162176, + "compression_loss": 17.563297271728516, + "distillation_loss": 2.043074369430542, + "epoch": 4.56, + "learning_rate": 7.73486817675455e-05, + "loss": 18.8081, + "step": 1820, + "task_loss": 1.5735231637954712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0680687767125, + "compression/movement_sparsity/importance_threshold": -0.0008667185068610755, + "compression/movement_sparsity/linear_layer_sparsity": 0.9706986670995182, + "compression/movement_sparsity/model_sparsity": 0.8726342965746821, + "compression_loss": 17.736778259277344, + "distillation_loss": 1.50892972946167, + "epoch": 4.59, + "learning_rate": 7.716301522465652e-05, + "loss": 18.9711, + "step": 1830, + "task_loss": 0.9894187450408936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0687644213375, + "compression/movement_sparsity/importance_threshold": -0.0008478364219106388, + "compression/movement_sparsity/linear_layer_sparsity": 0.9708350623870822, + "compression/movement_sparsity/model_sparsity": 0.8727569125888242, + "compression_loss": 17.906818389892578, + "distillation_loss": 1.3826042413711548, + "epoch": 4.61, + "learning_rate": 7.697734868176755e-05, + "loss": 19.124, + "step": 1840, + "task_loss": 0.9412574768066406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0694498884625, + "compression/movement_sparsity/importance_threshold": -0.0008292305878111065, + "compression/movement_sparsity/linear_layer_sparsity": 0.9710739305743752, + "compression/movement_sparsity/model_sparsity": 0.8729716492234566, + "compression_loss": 18.07468032836914, + "distillation_loss": 0.9980165362358093, + "epoch": 4.64, + "learning_rate": 7.679168213887858e-05, + "loss": 19.1274, + "step": 1850, + "task_loss": 0.6569799184799194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07012525308749999, + "compression/movement_sparsity/importance_threshold": -0.0008108989688156033, + "compression/movement_sparsity/linear_layer_sparsity": 0.9712370967893706, + "compression/movement_sparsity/model_sparsity": 0.8731183316439618, + "compression_loss": 18.239511489868164, + "distillation_loss": 0.9089177846908569, + "epoch": 4.66, + "learning_rate": 7.66060155959896e-05, + "loss": 19.3729, + "step": 1860, + "task_loss": 0.6694349050521851 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0707905902125, + "compression/movement_sparsity/importance_threshold": -0.0007928395291772544, + "compression/movement_sparsity/linear_layer_sparsity": 0.9713556369504667, + "compression/movement_sparsity/model_sparsity": 0.8732248963378669, + "compression_loss": 18.401731491088867, + "distillation_loss": 0.9277310371398926, + "epoch": 4.69, + "learning_rate": 7.642034905310064e-05, + "loss": 19.5876, + "step": 1870, + "task_loss": 0.5631176233291626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07144597483750001, + "compression/movement_sparsity/importance_threshold": -0.0007750502331491855, + "compression/movement_sparsity/linear_layer_sparsity": 0.9715034298780488, + "compression/movement_sparsity/model_sparsity": 0.8733577585505002, + "compression_loss": 18.561702728271484, + "distillation_loss": 1.5351662635803223, + "epoch": 4.71, + "learning_rate": 7.623468251021167e-05, + "loss": 19.7659, + "step": 1880, + "task_loss": 0.7325328588485718 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07209148196250001, + "compression/movement_sparsity/importance_threshold": -0.0007575290449845216, + "compression/movement_sparsity/linear_layer_sparsity": 0.9715637703252032, + "compression/movement_sparsity/model_sparsity": 0.8734120031307476, + "compression_loss": 18.71929168701172, + "distillation_loss": 1.2189714908599854, + "epoch": 4.74, + "learning_rate": 7.60490159673227e-05, + "loss": 19.858, + "step": 1890, + "task_loss": 0.7309414148330688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07272718658749999, + "compression/movement_sparsity/importance_threshold": -0.0007402739289363884, + "compression/movement_sparsity/linear_layer_sparsity": 0.971693637552695, + "compression/movement_sparsity/model_sparsity": 0.8735287505791904, + "compression_loss": 18.873430252075195, + "distillation_loss": 1.1738145351409912, + "epoch": 4.76, + "learning_rate": 7.586334942443371e-05, + "loss": 20.0053, + "step": 1900, + "task_loss": 0.8075315952301025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0733531637125, + "compression/movement_sparsity/importance_threshold": -0.0007232828492579097, + "compression/movement_sparsity/linear_layer_sparsity": 0.9718444063346884, + "compression/movement_sparsity/model_sparsity": 0.8736642880118631, + "compression_loss": 19.02474594116211, + "distillation_loss": 1.2783410549163818, + "epoch": 4.79, + "learning_rate": 7.567768288154476e-05, + "loss": 20.1099, + "step": 1910, + "task_loss": 0.8135542869567871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07396948833750001, + "compression/movement_sparsity/importance_threshold": -0.0007065537702022123, + "compression/movement_sparsity/linear_layer_sparsity": 0.9719619231782596, + "compression/movement_sparsity/model_sparsity": 0.8737699327684425, + "compression_loss": 19.174118041992188, + "distillation_loss": 1.0710728168487549, + "epoch": 4.81, + "learning_rate": 7.549201633865579e-05, + "loss": 20.2932, + "step": 1920, + "task_loss": 0.5648860931396484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07457623546250002, + "compression/movement_sparsity/importance_threshold": -0.0006900846560224209, + "compression/movement_sparsity/linear_layer_sparsity": 0.9720882382000904, + "compression/movement_sparsity/model_sparsity": 0.8738834868712255, + "compression_loss": 19.320566177368164, + "distillation_loss": 0.9400890469551086, + "epoch": 4.84, + "learning_rate": 7.53063497957668e-05, + "loss": 20.4165, + "step": 1930, + "task_loss": 0.5581340789794922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07517348008750001, + "compression/movement_sparsity/importance_threshold": -0.0006738734709716612, + "compression/movement_sparsity/linear_layer_sparsity": 0.9722053668887384, + "compression/movement_sparsity/model_sparsity": 0.8739887826860605, + "compression_loss": 19.465200424194336, + "distillation_loss": 1.4937341213226318, + "epoch": 4.86, + "learning_rate": 7.512068325287783e-05, + "loss": 20.5839, + "step": 1940, + "task_loss": 1.0040315389633179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07576129721250001, + "compression/movement_sparsity/importance_threshold": -0.0006579181793030576, + "compression/movement_sparsity/linear_layer_sparsity": 0.9722984064476061, + "compression/movement_sparsity/model_sparsity": 0.8740724229647655, + "compression_loss": 19.606977462768555, + "distillation_loss": 1.4447404146194458, + "epoch": 4.89, + "learning_rate": 7.493501670998886e-05, + "loss": 20.6829, + "step": 1950, + "task_loss": 1.094190001487732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0763397618375, + "compression/movement_sparsity/importance_threshold": -0.0006422167452697358, + "compression/movement_sparsity/linear_layer_sparsity": 0.9724066781466426, + "compression/movement_sparsity/model_sparsity": 0.8741697565634357, + "compression_loss": 19.746036529541016, + "distillation_loss": 1.3600887060165405, + "epoch": 4.91, + "learning_rate": 7.474935016709989e-05, + "loss": 20.8516, + "step": 1960, + "task_loss": 0.8484234809875488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0769089489625, + "compression/movement_sparsity/importance_threshold": -0.0006267671331248205, + "compression/movement_sparsity/linear_layer_sparsity": 0.9725269120746763, + "compression/movement_sparsity/model_sparsity": 0.8742778439122249, + "compression_loss": 19.882564544677734, + "distillation_loss": 0.8116432428359985, + "epoch": 4.94, + "learning_rate": 7.456368362421092e-05, + "loss": 20.9129, + "step": 1970, + "task_loss": 0.4139885902404785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07746893358750001, + "compression/movement_sparsity/importance_threshold": -0.000611567307121438, + "compression/movement_sparsity/linear_layer_sparsity": 0.972613929351099, + "compression/movement_sparsity/model_sparsity": 0.8743560703068974, + "compression_loss": 20.016332626342773, + "distillation_loss": 1.274233102798462, + "epoch": 4.96, + "learning_rate": 7.437801708132195e-05, + "loss": 21.0475, + "step": 1980, + "task_loss": 0.8784727454185486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0780197907125, + "compression/movement_sparsity/importance_threshold": -0.0005966152315127131, + "compression/movement_sparsity/linear_layer_sparsity": 0.9726826445347787, + "compression/movement_sparsity/model_sparsity": 0.8744178435696275, + "compression_loss": 20.14716911315918, + "distillation_loss": 0.6197555065155029, + "epoch": 4.99, + "learning_rate": 7.419235053843298e-05, + "loss": 21.0916, + "step": 1990, + "task_loss": 0.38870543241500854 + }, + { + "epoch": 5.0, + "eval_accuracy": 0.8103854074727861, + "eval_loss": 21.031455993652344, + "eval_runtime": 33.2082, + "eval_samples_per_second": 204.708, + "eval_steps_per_second": 3.222, + "step": 1995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07861528079999999, + "compression/movement_sparsity/importance_threshold": -0.00058045167038505, + "compression/movement_sparsity/linear_layer_sparsity": 0.9728296258657031, + "compression/movement_sparsity/model_sparsity": 0.8745499761767954, + "compression_loss": 20.2894287109375, + "distillation_loss": 1.4065853357315063, + "epoch": 5.01, + "learning_rate": 7.4006683995544e-05, + "loss": 21.8014, + "step": 2000, + "task_loss": 1.035426139831543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0791472143, + "compression/movement_sparsity/importance_threshold": -0.0005660132442490279, + "compression/movement_sparsity/linear_layer_sparsity": 0.9728921423328817, + "compression/movement_sparsity/model_sparsity": 0.8746061769456093, + "compression_loss": 20.415363311767578, + "distillation_loss": 1.1601554155349731, + "epoch": 5.04, + "learning_rate": 7.382101745265504e-05, + "loss": 21.3696, + "step": 2010, + "task_loss": 0.6176354289054871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07967025280000001, + "compression/movement_sparsity/importance_threshold": -0.0005518162576923519, + "compression/movement_sparsity/linear_layer_sparsity": 0.9730359478507979, + "compression/movement_sparsity/model_sparsity": 0.8747354545748695, + "compression_loss": 20.53955841064453, + "distillation_loss": 0.9918860793113708, + "epoch": 5.06, + "learning_rate": 7.363535090976607e-05, + "loss": 21.4975, + "step": 2020, + "task_loss": 0.5870554447174072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0801844713, + "compression/movement_sparsity/importance_threshold": -0.0005378586749681469, + "compression/movement_sparsity/linear_layer_sparsity": 0.9731496066696778, + "compression/movement_sparsity/model_sparsity": 0.8748376310619904, + "compression_loss": 20.661848068237305, + "distillation_loss": 0.9830354452133179, + "epoch": 5.09, + "learning_rate": 7.344968436687709e-05, + "loss": 21.6823, + "step": 2030, + "task_loss": 0.6770889163017273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0806899448, + "compression/movement_sparsity/importance_threshold": -0.0005241384603295383, + "compression/movement_sparsity/linear_layer_sparsity": 0.973217604354863, + "compression/movement_sparsity/model_sparsity": 0.8748987593111932, + "compression_loss": 20.781530380249023, + "distillation_loss": 1.882486343383789, + "epoch": 5.11, + "learning_rate": 7.326401782398812e-05, + "loss": 21.7331, + "step": 2040, + "task_loss": 1.5382585525512695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0811867483, + "compression/movement_sparsity/importance_threshold": -0.0005106535780296513, + "compression/movement_sparsity/linear_layer_sparsity": 0.9733279579757603, + "compression/movement_sparsity/model_sparsity": 0.8749979645064917, + "compression_loss": 20.898618698120117, + "distillation_loss": 0.929511308670044, + "epoch": 5.14, + "learning_rate": 7.307835128109915e-05, + "loss": 21.8626, + "step": 2050, + "task_loss": 0.927236795425415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.08167495680000002, + "compression/movement_sparsity/importance_threshold": -0.00049740199232161, + "compression/movement_sparsity/linear_layer_sparsity": 0.973299516805932, + "compression/movement_sparsity/model_sparsity": 0.8749723965932289, + "compression_loss": 21.01331329345703, + "distillation_loss": 1.36324143409729, + "epoch": 5.16, + "learning_rate": 7.289268473821018e-05, + "loss": 21.9156, + "step": 2060, + "task_loss": 0.9404169917106628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.08215464530000001, + "compression/movement_sparsity/importance_threshold": -0.00048438166745854187, + "compression/movement_sparsity/linear_layer_sparsity": 0.9734310660380909, + "compression/movement_sparsity/model_sparsity": 0.8750906561225635, + "compression_loss": 21.126338958740234, + "distillation_loss": 0.800617516040802, + "epoch": 5.19, + "learning_rate": 7.27070181953212e-05, + "loss": 22.0209, + "step": 2070, + "task_loss": 0.35712623596191406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.08262588879999999, + "compression/movement_sparsity/importance_threshold": -0.000471590567693571, + "compression/movement_sparsity/linear_layer_sparsity": 0.9735261522320084, + "compression/movement_sparsity/model_sparsity": 0.8751761362759203, + "compression_loss": 21.237825393676758, + "distillation_loss": 1.5399556159973145, + "epoch": 5.21, + "learning_rate": 7.252135165243224e-05, + "loss": 22.165, + "step": 2080, + "task_loss": 1.030541181564331 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.08308876230000001, + "compression/movement_sparsity/importance_threshold": -0.0004590266572798221, + "compression/movement_sparsity/linear_layer_sparsity": 0.9735669084989461, + "compression/movement_sparsity/model_sparsity": 0.8752127751590698, + "compression_loss": 21.346500396728516, + "distillation_loss": 0.5890419483184814, + "epoch": 5.24, + "learning_rate": 7.233568510954327e-05, + "loss": 22.348, + "step": 2090, + "task_loss": 0.32965362071990967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0835433408, + "compression/movement_sparsity/importance_threshold": -0.00044668790047042075, + "compression/movement_sparsity/linear_layer_sparsity": 0.9736586306835291, + "compression/movement_sparsity/model_sparsity": 0.8752952311506428, + "compression_loss": 21.45292091369629, + "distillation_loss": 1.2875161170959473, + "epoch": 5.26, + "learning_rate": 7.215001856665428e-05, + "loss": 22.3559, + "step": 2100, + "task_loss": 0.6388518810272217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.08398969930000001, + "compression/movement_sparsity/importance_threshold": -0.00043457226151849245, + "compression/movement_sparsity/linear_layer_sparsity": 0.9737021863708221, + "compression/movement_sparsity/model_sparsity": 0.8753343866439481, + "compression_loss": 21.558067321777344, + "distillation_loss": 0.7127068042755127, + "epoch": 5.29, + "learning_rate": 7.196435202376533e-05, + "loss": 22.4019, + "step": 2110, + "task_loss": 0.5173584818840027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0844279128, + "compression/movement_sparsity/importance_threshold": -0.00042267770467716275, + "compression/movement_sparsity/linear_layer_sparsity": 0.9737336974932249, + "compression/movement_sparsity/model_sparsity": 0.8753627143691884, + "compression_loss": 21.661664962768555, + "distillation_loss": 0.9345526695251465, + "epoch": 5.31, + "learning_rate": 7.177868548087636e-05, + "loss": 22.6798, + "step": 2120, + "task_loss": 0.6000822186470032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0848580563, + "compression/movement_sparsity/importance_threshold": -0.00041100219419955633, + "compression/movement_sparsity/linear_layer_sparsity": 0.9738223850120445, + "compression/movement_sparsity/model_sparsity": 0.8754424422707606, + "compression_loss": 21.762226104736328, + "distillation_loss": 0.9101648330688477, + "epoch": 5.34, + "learning_rate": 7.159301893798737e-05, + "loss": 22.6571, + "step": 2130, + "task_loss": 0.5418741106987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.08528020480000001, + "compression/movement_sparsity/importance_threshold": -0.0003995436943387983, + "compression/movement_sparsity/linear_layer_sparsity": 0.9739351498983739, + "compression/movement_sparsity/model_sparsity": 0.8755438151344705, + "compression_loss": 21.860109329223633, + "distillation_loss": 0.8067530393600464, + "epoch": 5.36, + "learning_rate": 7.14073523950984e-05, + "loss": 22.7909, + "step": 2140, + "task_loss": 0.4676440358161926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0856944333, + "compression/movement_sparsity/importance_threshold": -0.00038830016934801505, + "compression/movement_sparsity/linear_layer_sparsity": 0.9740447624962361, + "compression/movement_sparsity/model_sparsity": 0.8756423541682572, + "compression_loss": 21.95547103881836, + "distillation_loss": 0.797344446182251, + "epoch": 5.39, + "learning_rate": 7.122168585220943e-05, + "loss": 22.86, + "step": 2150, + "task_loss": 0.5267999172210693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.08610081680000001, + "compression/movement_sparsity/importance_threshold": -0.00037726958348033, + "compression/movement_sparsity/linear_layer_sparsity": 0.9741336146868413, + "compression/movement_sparsity/model_sparsity": 0.8757222301057209, + "compression_loss": 22.048969268798828, + "distillation_loss": 1.4408375024795532, + "epoch": 5.41, + "learning_rate": 7.103601930932046e-05, + "loss": 22.9565, + "step": 2160, + "task_loss": 1.0003676414489746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.08649943030000001, + "compression/movement_sparsity/importance_threshold": -0.0003664499009888699, + "compression/movement_sparsity/linear_layer_sparsity": 0.9741347321025293, + "compression/movement_sparsity/model_sparsity": 0.8757232346349847, + "compression_loss": 22.1412410736084, + "distillation_loss": 0.5943891406059265, + "epoch": 5.44, + "learning_rate": 7.085035276643149e-05, + "loss": 23.0139, + "step": 2170, + "task_loss": 0.45891618728637695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0868903488, + "compression/movement_sparsity/importance_threshold": -0.00035583908612675996, + "compression/movement_sparsity/linear_layer_sparsity": 0.9741842277363746, + "compression/movement_sparsity/model_sparsity": 0.8757677299943768, + "compression_loss": 22.23069953918457, + "distillation_loss": 0.8800859451293945, + "epoch": 5.46, + "learning_rate": 7.066468622354252e-05, + "loss": 23.0837, + "step": 2180, + "task_loss": 0.7157919406890869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0872736473, + "compression/movement_sparsity/importance_threshold": -0.00034543510314712393, + "compression/movement_sparsity/linear_layer_sparsity": 0.9742577654509184, + "compression/movement_sparsity/model_sparsity": 0.8758338385939297, + "compression_loss": 22.31747817993164, + "distillation_loss": 1.4458627700805664, + "epoch": 5.49, + "learning_rate": 7.047901968065355e-05, + "loss": 23.2124, + "step": 2190, + "task_loss": 0.8665592670440674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0876494008, + "compression/movement_sparsity/importance_threshold": -0.0003352359163030891, + "compression/movement_sparsity/linear_layer_sparsity": 0.9742859360885275, + "compression/movement_sparsity/model_sparsity": 0.8758591633053707, + "compression_loss": 22.402511596679688, + "distillation_loss": 1.4791427850723267, + "epoch": 5.51, + "learning_rate": 7.029335313776457e-05, + "loss": 23.237, + "step": 2200, + "task_loss": 1.0247491598129272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.08801768430000001, + "compression/movement_sparsity/importance_threshold": -0.00032523948984777926, + "compression/movement_sparsity/linear_layer_sparsity": 0.9743054967442035, + "compression/movement_sparsity/model_sparsity": 0.8758767478544841, + "compression_loss": 22.486186981201172, + "distillation_loss": 0.8023099303245544, + "epoch": 5.54, + "learning_rate": 7.010768659487561e-05, + "loss": 23.3537, + "step": 2210, + "task_loss": 0.5102382898330688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0883785728, + "compression/movement_sparsity/importance_threshold": -0.00031544378803432, + "compression/movement_sparsity/linear_layer_sparsity": 0.9743065553485396, + "compression/movement_sparsity/model_sparsity": 0.8758776995137867, + "compression_loss": 22.567340850830078, + "distillation_loss": 0.5300278067588806, + "epoch": 5.56, + "learning_rate": 6.992202005198664e-05, + "loss": 23.4238, + "step": 2220, + "task_loss": 0.2836514711380005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.08873214130000001, + "compression/movement_sparsity/importance_threshold": -0.0003058467751158368, + "compression/movement_sparsity/linear_layer_sparsity": 0.974325598464318, + "compression/movement_sparsity/model_sparsity": 0.875894818807241, + "compression_loss": 22.646709442138672, + "distillation_loss": 1.0705655813217163, + "epoch": 5.59, + "learning_rate": 6.973635350909766e-05, + "loss": 23.5207, + "step": 2230, + "task_loss": 0.714119017124176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0890784648, + "compression/movement_sparsity/importance_threshold": -0.00029644641534545477, + "compression/movement_sparsity/linear_layer_sparsity": 0.9743568625790424, + "compression/movement_sparsity/model_sparsity": 0.8759229244786441, + "compression_loss": 22.723915100097656, + "distillation_loss": 0.949068546295166, + "epoch": 5.61, + "learning_rate": 6.955068696620869e-05, + "loss": 23.5363, + "step": 2240, + "task_loss": 0.706423819065094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0894176183, + "compression/movement_sparsity/importance_threshold": -0.00028724067297629906, + "compression/movement_sparsity/linear_layer_sparsity": 0.9743730945121951, + "compression/movement_sparsity/model_sparsity": 0.8759375165879504, + "compression_loss": 22.799074172973633, + "distillation_loss": 1.1835298538208008, + "epoch": 5.64, + "learning_rate": 6.936502042331973e-05, + "loss": 23.6207, + "step": 2250, + "task_loss": 0.8958919048309326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.08974967680000001, + "compression/movement_sparsity/importance_threshold": -0.0002782275122614952, + "compression/movement_sparsity/linear_layer_sparsity": 0.9744193555216802, + "compression/movement_sparsity/model_sparsity": 0.8759791040994733, + "compression_loss": 22.8729248046875, + "distillation_loss": 0.6387713551521301, + "epoch": 5.66, + "learning_rate": 6.917935388043075e-05, + "loss": 23.7426, + "step": 2260, + "task_loss": 0.4759640097618103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0900747153, + "compression/movement_sparsity/importance_threshold": -0.00026940489745416784, + "compression/movement_sparsity/linear_layer_sparsity": 0.974411333653267, + "compression/movement_sparsity/model_sparsity": 0.8759718926367582, + "compression_loss": 22.944398880004883, + "distillation_loss": 0.5631880760192871, + "epoch": 5.69, + "learning_rate": 6.899368733754178e-05, + "loss": 23.7749, + "step": 2270, + "task_loss": 0.27463316917419434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0903928088, + "compression/movement_sparsity/importance_threshold": -0.00026077079280744297, + "compression/movement_sparsity/linear_layer_sparsity": 0.9744394337172538, + "compression/movement_sparsity/model_sparsity": 0.8759971539042457, + "compression_loss": 23.01436996459961, + "distillation_loss": 1.4235520362854004, + "epoch": 5.71, + "learning_rate": 6.88080207946528e-05, + "loss": 23.8333, + "step": 2280, + "task_loss": 1.0092356204986572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0907040323, + "compression/movement_sparsity/importance_threshold": -0.0002523231625744457, + "compression/movement_sparsity/linear_layer_sparsity": 0.9744885059093646, + "compression/movement_sparsity/model_sparsity": 0.8760412685999167, + "compression_loss": 23.083436965942383, + "distillation_loss": 0.8579853773117065, + "epoch": 5.74, + "learning_rate": 6.862235425176384e-05, + "loss": 23.8762, + "step": 2290, + "task_loss": 0.5234992504119873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09100846080000001, + "compression/movement_sparsity/importance_threshold": -0.0002440599710083007, + "compression/movement_sparsity/linear_layer_sparsity": 0.9745796635049684, + "compression/movement_sparsity/model_sparsity": 0.8761232170398616, + "compression_loss": 23.150949478149414, + "distillation_loss": 1.0696903467178345, + "epoch": 5.76, + "learning_rate": 6.843668770887487e-05, + "loss": 24.0028, + "step": 2300, + "task_loss": 0.6211066246032715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09130616930000002, + "compression/movement_sparsity/importance_threshold": -0.00023597918236213352, + "compression/movement_sparsity/linear_layer_sparsity": 0.9745806397734116, + "compression/movement_sparsity/model_sparsity": 0.8761240946812184, + "compression_loss": 23.216720581054688, + "distillation_loss": 0.4247554540634155, + "epoch": 5.79, + "learning_rate": 6.82510211659859e-05, + "loss": 24.0176, + "step": 2310, + "task_loss": 0.2160654067993164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09159723280000001, + "compression/movement_sparsity/importance_threshold": -0.00022807876088907011, + "compression/movement_sparsity/linear_layer_sparsity": 0.9746447206225535, + "compression/movement_sparsity/model_sparsity": 0.8761817017910016, + "compression_loss": 23.279521942138672, + "distillation_loss": 0.8901571035385132, + "epoch": 5.81, + "learning_rate": 6.806535462309693e-05, + "loss": 24.0827, + "step": 2320, + "task_loss": 0.5934572219848633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09188172630000001, + "compression/movement_sparsity/importance_threshold": -0.00022035667084223474, + "compression/movement_sparsity/linear_layer_sparsity": 0.9747268330322192, + "compression/movement_sparsity/model_sparsity": 0.8762555188309056, + "compression_loss": 23.340944290161133, + "distillation_loss": 0.7367979884147644, + "epoch": 5.84, + "learning_rate": 6.787968808020794e-05, + "loss": 24.1653, + "step": 2330, + "task_loss": 0.7227636575698853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09215972480000001, + "compression/movement_sparsity/importance_threshold": -0.00021281087647475293, + "compression/movement_sparsity/linear_layer_sparsity": 0.974712730070009, + "compression/movement_sparsity/model_sparsity": 0.8762428406141967, + "compression_loss": 23.401010513305664, + "distillation_loss": 0.6449049711227417, + "epoch": 5.86, + "learning_rate": 6.769402153731897e-05, + "loss": 24.1697, + "step": 2340, + "task_loss": 0.4722440242767334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0924313033, + "compression/movement_sparsity/importance_threshold": -0.00020543934203975022, + "compression/movement_sparsity/linear_layer_sparsity": 0.9747678127822945, + "compression/movement_sparsity/model_sparsity": 0.876292358619908, + "compression_loss": 23.458995819091797, + "distillation_loss": 0.8528883457183838, + "epoch": 5.89, + "learning_rate": 6.750835499443002e-05, + "loss": 24.2242, + "step": 2350, + "task_loss": 0.5414759516716003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0926965368, + "compression/movement_sparsity/importance_threshold": -0.0001982400317903513, + "compression/movement_sparsity/linear_layer_sparsity": 0.9748377982911773, + "compression/movement_sparsity/model_sparsity": 0.8763552738738011, + "compression_loss": 23.51533317565918, + "distillation_loss": 0.6509031057357788, + "epoch": 5.91, + "learning_rate": 6.732268845154103e-05, + "loss": 24.2958, + "step": 2360, + "task_loss": 0.336683988571167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0929555003, + "compression/movement_sparsity/importance_threshold": -0.00019121090997968257, + "compression/movement_sparsity/linear_layer_sparsity": 0.9748416563158687, + "compression/movement_sparsity/model_sparsity": 0.8763587421432594, + "compression_loss": 23.570663452148438, + "distillation_loss": 1.0414843559265137, + "epoch": 5.94, + "learning_rate": 6.713702190865206e-05, + "loss": 24.3589, + "step": 2370, + "task_loss": 0.6974769830703735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0932082688, + "compression/movement_sparsity/importance_threshold": -0.00018434994086086783, + "compression/movement_sparsity/linear_layer_sparsity": 0.9748963626355014, + "compression/movement_sparsity/model_sparsity": 0.8764079217812186, + "compression_loss": 23.623455047607422, + "distillation_loss": 0.5835083723068237, + "epoch": 5.96, + "learning_rate": 6.695135536576309e-05, + "loss": 24.3341, + "step": 2380, + "task_loss": 0.5392003655433655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0934549173, + "compression/movement_sparsity/importance_threshold": -0.00017765508868703263, + "compression/movement_sparsity/linear_layer_sparsity": 0.974906160606745, + "compression/movement_sparsity/model_sparsity": 0.8764167299167637, + "compression_loss": 23.67535972595215, + "distillation_loss": 0.36937326192855835, + "epoch": 5.99, + "learning_rate": 6.676568882287412e-05, + "loss": 24.4471, + "step": 2390, + "task_loss": 0.22642618417739868 + }, + { + "epoch": 6.0, + "eval_accuracy": 0.9073256840247131, + "eval_loss": 24.235685348510742, + "eval_runtime": 33.673, + "eval_samples_per_second": 201.883, + "eval_steps_per_second": 3.178, + "step": 2394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09371925156250001, + "compression/movement_sparsity/importance_threshold": -0.00017048018670337084, + "compression/movement_sparsity/linear_layer_sparsity": 0.9749610433604337, + "compression/movement_sparsity/model_sparsity": 0.8764660681646066, + "compression_loss": 23.73293113708496, + "distillation_loss": 0.6465860605239868, + "epoch": 6.02, + "learning_rate": 6.658002227998515e-05, + "loss": 25.1396, + "step": 2400, + "task_loss": 0.3431563377380371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09395329218750001, + "compression/movement_sparsity/importance_threshold": -0.00016412755375791671, + "compression/movement_sparsity/linear_layer_sparsity": 0.974988719982686, + "compression/movement_sparsity/model_sparsity": 0.8764909487683731, + "compression_loss": 23.7828311920166, + "distillation_loss": 0.8212810754776001, + "epoch": 6.04, + "learning_rate": 6.639435573709618e-05, + "loss": 24.5321, + "step": 2410, + "task_loss": 0.5524295568466187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0941814453125, + "compression/movement_sparsity/importance_threshold": -0.0001579347269421304, + "compression/movement_sparsity/linear_layer_sparsity": 0.9750163966049382, + "compression/movement_sparsity/model_sparsity": 0.8765158293721396, + "compression_loss": 23.830631256103516, + "distillation_loss": 0.6097371578216553, + "epoch": 6.07, + "learning_rate": 6.620868919420721e-05, + "loss": 24.5868, + "step": 2420, + "task_loss": 0.4853460192680359 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0944037859375, + "compression/movement_sparsity/importance_threshold": -0.00015189967050913784, + "compression/movement_sparsity/linear_layer_sparsity": 0.9750260063798555, + "compression/movement_sparsity/model_sparsity": 0.8765244683238086, + "compression_loss": 23.876558303833008, + "distillation_loss": 0.4932361841201782, + "epoch": 6.09, + "learning_rate": 6.602302265131823e-05, + "loss": 24.608, + "step": 2430, + "task_loss": 0.24706822633743286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0946203890625, + "compression/movement_sparsity/importance_threshold": -0.0001460203487120633, + "compression/movement_sparsity/linear_layer_sparsity": 0.9750266533047275, + "compression/movement_sparsity/model_sparsity": 0.8765250498933824, + "compression_loss": 23.920989990234375, + "distillation_loss": 1.079487681388855, + "epoch": 6.12, + "learning_rate": 6.583735610842926e-05, + "loss": 24.616, + "step": 2440, + "task_loss": 0.7890568375587463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09483132968750001, + "compression/movement_sparsity/importance_threshold": -0.0001402947258040332, + "compression/movement_sparsity/linear_layer_sparsity": 0.9750906165311654, + "compression/movement_sparsity/model_sparsity": 0.8765825512632431, + "compression_loss": 23.964323043823242, + "distillation_loss": 0.5283878445625305, + "epoch": 6.14, + "learning_rate": 6.56516895655403e-05, + "loss": 24.6953, + "step": 2450, + "task_loss": 0.43571531772613525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09503668281250001, + "compression/movement_sparsity/importance_threshold": -0.00013472076603817176, + "compression/movement_sparsity/linear_layer_sparsity": 0.9751453463753388, + "compression/movement_sparsity/model_sparsity": 0.8766317520491869, + "compression_loss": 24.0065975189209, + "distillation_loss": 0.5408639907836914, + "epoch": 6.17, + "learning_rate": 6.546602302265132e-05, + "loss": 24.7311, + "step": 2460, + "task_loss": 0.2528020143508911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0952365234375, + "compression/movement_sparsity/importance_threshold": -0.00012929643366760496, + "compression/movement_sparsity/linear_layer_sparsity": 0.9751459462511292, + "compression/movement_sparsity/model_sparsity": 0.8766322913227916, + "compression_loss": 24.046323776245117, + "distillation_loss": 0.6228930950164795, + "epoch": 6.19, + "learning_rate": 6.528035647976235e-05, + "loss": 24.7191, + "step": 2470, + "task_loss": 0.3793519139289856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0954309265625, + "compression/movement_sparsity/importance_threshold": -0.0001240196929454579, + "compression/movement_sparsity/linear_layer_sparsity": 0.9752006408084914, + "compression/movement_sparsity/model_sparsity": 0.8766814603867585, + "compression_loss": 24.084972381591797, + "distillation_loss": 0.8466883897781372, + "epoch": 6.22, + "learning_rate": 6.509468993687338e-05, + "loss": 24.867, + "step": 2480, + "task_loss": 0.5272025465965271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0956199671875, + "compression/movement_sparsity/importance_threshold": -0.00011888850812485528, + "compression/movement_sparsity/linear_layer_sparsity": 0.9752012524465522, + "compression/movement_sparsity/model_sparsity": 0.8766820102343557, + "compression_loss": 24.122623443603516, + "distillation_loss": 1.0965839624404907, + "epoch": 6.24, + "learning_rate": 6.49090233939844e-05, + "loss": 24.8222, + "step": 2490, + "task_loss": 0.8911365866661072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09580372031250001, + "compression/movement_sparsity/importance_threshold": -0.00011390084345892304, + "compression/movement_sparsity/linear_layer_sparsity": 0.975268003331075, + "compression/movement_sparsity/model_sparsity": 0.8767420176403798, + "compression_loss": 24.158720016479492, + "distillation_loss": 0.5235607028007507, + "epoch": 6.27, + "learning_rate": 6.472335685109544e-05, + "loss": 24.8269, + "step": 2500, + "task_loss": 0.3162381052970886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09598226093750001, + "compression/movement_sparsity/importance_threshold": -0.00010905466320078631, + "compression/movement_sparsity/linear_layer_sparsity": 0.9752684855841613, + "compression/movement_sparsity/model_sparsity": 0.8767424511740621, + "compression_loss": 24.193584442138672, + "distillation_loss": 0.7700557708740234, + "epoch": 6.29, + "learning_rate": 6.453769030820647e-05, + "loss": 24.9027, + "step": 2510, + "task_loss": 0.47537004947662354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09615566406250001, + "compression/movement_sparsity/importance_threshold": -0.00010434793160357019, + "compression/movement_sparsity/linear_layer_sparsity": 0.9753051485809997, + "compression/movement_sparsity/model_sparsity": 0.8767754103079083, + "compression_loss": 24.227218627929688, + "distillation_loss": 0.5995447635650635, + "epoch": 6.32, + "learning_rate": 6.43520237653175e-05, + "loss": 24.9055, + "step": 2520, + "task_loss": 0.3423469066619873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0963240046875, + "compression/movement_sparsity/importance_threshold": -9.977861292039978e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9753596902288467, + "compression/movement_sparsity/model_sparsity": 0.876824441909976, + "compression_loss": 24.259553909301758, + "distillation_loss": 1.2136845588684082, + "epoch": 6.34, + "learning_rate": 6.416635722242851e-05, + "loss": 25.0306, + "step": 2530, + "task_loss": 0.7751702070236206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09648735781250001, + "compression/movement_sparsity/importance_threshold": -9.53446714044002e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9753509508619391, + "compression/movement_sparsity/model_sparsity": 0.8768165854337335, + "compression_loss": 24.291015625, + "distillation_loss": 0.6761437654495239, + "epoch": 6.37, + "learning_rate": 6.398069067953954e-05, + "loss": 25.0015, + "step": 2540, + "task_loss": 0.7406394481658936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0966457984375, + "compression/movement_sparsity/importance_threshold": -9.104407130869742e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9753874609492623, + "compression/movement_sparsity/model_sparsity": 0.8768494071056805, + "compression_loss": 24.321796417236328, + "distillation_loss": 0.8070812225341797, + "epoch": 6.39, + "learning_rate": 6.379502413665059e-05, + "loss": 25.0003, + "step": 2550, + "task_loss": 0.5367677807807922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0967994015625, + "compression/movement_sparsity/importance_threshold": -8.687477688641611e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9754059394760615, + "compression/movement_sparsity/model_sparsity": 0.8768660188475067, + "compression_loss": 24.350906372070312, + "distillation_loss": 0.4816051125526428, + "epoch": 6.42, + "learning_rate": 6.360935759376162e-05, + "loss": 25.0526, + "step": 2560, + "task_loss": 0.2525244355201721 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0969482421875, + "compression/movement_sparsity/importance_threshold": -8.283475239068139e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9754244650519421, + "compression/movement_sparsity/model_sparsity": 0.876882672885302, + "compression_loss": 24.378854751586914, + "distillation_loss": 0.8856838941574097, + "epoch": 6.44, + "learning_rate": 6.342369105087263e-05, + "loss": 25.0494, + "step": 2570, + "task_loss": 0.4870672821998596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09709239531250001, + "compression/movement_sparsity/importance_threshold": -7.892196207461878e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9754670209462512, + "compression/movement_sparsity/model_sparsity": 0.8769209295892659, + "compression_loss": 24.404521942138672, + "distillation_loss": 0.5772982239723206, + "epoch": 6.47, + "learning_rate": 6.323802450798366e-05, + "loss": 25.0121, + "step": 2580, + "task_loss": 0.4074034094810486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09723193593750001, + "compression/movement_sparsity/importance_threshold": -7.513437019135384e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9754764778116531, + "compression/movement_sparsity/model_sparsity": 0.8769294310790358, + "compression_loss": 24.428165435791016, + "distillation_loss": 0.8608008027076721, + "epoch": 6.49, + "learning_rate": 6.305235796509469e-05, + "loss": 25.0618, + "step": 2590, + "task_loss": 0.7958731651306152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0973669390625, + "compression/movement_sparsity/importance_threshold": -7.146994099401123e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9754949445761819, + "compression/movement_sparsity/model_sparsity": 0.8769460322468697, + "compression_loss": 24.451786041259766, + "distillation_loss": 0.6185022592544556, + "epoch": 6.52, + "learning_rate": 6.286669142220572e-05, + "loss": 25.0804, + "step": 2600, + "task_loss": 0.3595033288002014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0974974796875, + "compression/movement_sparsity/importance_threshold": -6.79266387357165e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9755224565266486, + "compression/movement_sparsity/model_sparsity": 0.8769707648147447, + "compression_loss": 24.47515869140625, + "distillation_loss": 0.9395725727081299, + "epoch": 6.54, + "learning_rate": 6.268102487931675e-05, + "loss": 25.1608, + "step": 2610, + "task_loss": 0.6106551289558411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09762363281250001, + "compression/movement_sparsity/importance_threshold": -6.450242766959475e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9755498626166818, + "compression/movement_sparsity/model_sparsity": 0.8769954022166895, + "compression_loss": 24.49726104736328, + "distillation_loss": 0.41476964950561523, + "epoch": 6.57, + "learning_rate": 6.249535833642778e-05, + "loss": 25.0991, + "step": 2620, + "task_loss": 0.2240511178970337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09774547343750001, + "compression/movement_sparsity/importance_threshold": -6.119527204877153e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9755411232497742, + "compression/movement_sparsity/model_sparsity": 0.876987545740447, + "compression_loss": 24.51894760131836, + "distillation_loss": 0.6852964162826538, + "epoch": 6.59, + "learning_rate": 6.230969179353881e-05, + "loss": 25.1583, + "step": 2630, + "task_loss": 0.6814752817153931 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0978630765625, + "compression/movement_sparsity/importance_threshold": -5.800313612637194e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9755504507302017, + "compression/movement_sparsity/model_sparsity": 0.876995930916302, + "compression_loss": 24.53898811340332, + "distillation_loss": 0.38854435086250305, + "epoch": 6.62, + "learning_rate": 6.212402525064983e-05, + "loss": 25.2094, + "step": 2640, + "task_loss": 0.24611139297485352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0979765171875, + "compression/movement_sparsity/importance_threshold": -5.492398415552066e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.97559592366757, + "compression/movement_sparsity/model_sparsity": 0.8770368099703442, + "compression_loss": 24.558744430541992, + "distillation_loss": 0.5636383295059204, + "epoch": 6.64, + "learning_rate": 6.193835870776087e-05, + "loss": 25.2708, + "step": 2650, + "task_loss": 0.4168766140937805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0980858703125, + "compression/movement_sparsity/importance_threshold": -5.195578038934409e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.97560532172162, + "compression/movement_sparsity/model_sparsity": 0.8770452585901527, + "compression_loss": 24.57725715637207, + "distillation_loss": 0.9742121696472168, + "epoch": 6.67, + "learning_rate": 6.17526921648719e-05, + "loss": 25.2465, + "step": 2660, + "task_loss": 0.527445912361145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09819121093750001, + "compression/movement_sparsity/importance_threshold": -4.909648908096604e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9756599104185486, + "compression/movement_sparsity/model_sparsity": 0.8770943324881894, + "compression_loss": 24.594053268432617, + "distillation_loss": 0.5447437763214111, + "epoch": 6.69, + "learning_rate": 6.156702562198292e-05, + "loss": 25.2253, + "step": 2670, + "task_loss": 0.46921461820602417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0982926140625, + "compression/movement_sparsity/importance_threshold": -4.634407448351335e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9756602044753087, + "compression/movement_sparsity/model_sparsity": 0.8770945968379956, + "compression_loss": 24.61036491394043, + "distillation_loss": 0.17850594222545624, + "epoch": 6.72, + "learning_rate": 6.138135907909395e-05, + "loss": 25.3016, + "step": 2680, + "task_loss": 0.07567036151885986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09839015468750001, + "compression/movement_sparsity/importance_threshold": -4.369650085011027e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9756695437180066, + "compression/movement_sparsity/model_sparsity": 0.877102992587843, + "compression_loss": 24.62556266784668, + "distillation_loss": 0.5694016218185425, + "epoch": 6.74, + "learning_rate": 6.119569253620498e-05, + "loss": 25.2341, + "step": 2690, + "task_loss": 0.40597182512283325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09848390781250001, + "compression/movement_sparsity/importance_threshold": -4.1151732433881896e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9756879634334538, + "compression/movement_sparsity/model_sparsity": 0.8771195514597079, + "compression_loss": 24.639806747436523, + "distillation_loss": 0.8171936273574829, + "epoch": 6.77, + "learning_rate": 6.1010025993316014e-05, + "loss": 25.2807, + "step": 2700, + "task_loss": 0.4754343628883362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0985739484375, + "compression/movement_sparsity/importance_threshold": -3.8707733487954206e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9756791770174646, + "compression/movement_sparsity/model_sparsity": 0.8771116526874965, + "compression_loss": 24.653369903564453, + "distillation_loss": 0.6007073521614075, + "epoch": 6.79, + "learning_rate": 6.082435945042704e-05, + "loss": 25.3002, + "step": 2710, + "task_loss": 0.36723655462265015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09866035156250001, + "compression/movement_sparsity/importance_threshold": -3.636246826545144e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9756794240251431, + "compression/movement_sparsity/model_sparsity": 0.8771118747413337, + "compression_loss": 24.665813446044922, + "distillation_loss": 0.7570244669914246, + "epoch": 6.82, + "learning_rate": 6.063869290753807e-05, + "loss": 25.2799, + "step": 2720, + "task_loss": 0.4490365982055664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0987431921875, + "compression/movement_sparsity/importance_threshold": -3.411390101950044e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9757278140055706, + "compression/movement_sparsity/model_sparsity": 0.8771553761454541, + "compression_loss": 24.677072525024414, + "distillation_loss": 0.49257898330688477, + "epoch": 6.84, + "learning_rate": 6.045302636464909e-05, + "loss": 25.2857, + "step": 2730, + "task_loss": 0.2251560091972351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09882254531250001, + "compression/movement_sparsity/importance_threshold": -3.195999600322458e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9757733222297501, + "compression/movement_sparsity/model_sparsity": 0.8771962869214731, + "compression_loss": 24.68729591369629, + "distillation_loss": 1.3124232292175293, + "epoch": 6.87, + "learning_rate": 6.026735982176012e-05, + "loss": 25.3743, + "step": 2740, + "task_loss": 0.8676391243934631 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0988984859375, + "compression/movement_sparsity/importance_threshold": -2.9898717469750268e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9758098205548028, + "compression/movement_sparsity/model_sparsity": 0.8772290980194278, + "compression_loss": 24.696430206298828, + "distillation_loss": 0.9397382736206055, + "epoch": 6.89, + "learning_rate": 6.0081693278871156e-05, + "loss": 25.2648, + "step": 2750, + "task_loss": 0.5887734889984131 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0989710890625, + "compression/movement_sparsity/importance_threshold": -2.792802967220261e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9758191480352304, + "compression/movement_sparsity/model_sparsity": 0.8772374831952828, + "compression_loss": 24.7053279876709, + "distillation_loss": 0.7364386320114136, + "epoch": 6.92, + "learning_rate": 5.989602673598218e-05, + "loss": 25.3351, + "step": 2760, + "task_loss": 0.4724128842353821 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0990404296875, + "compression/movement_sparsity/importance_threshold": -2.6045896863706713e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9758646209725986, + "compression/movement_sparsity/model_sparsity": 0.877278362249325, + "compression_loss": 24.713607788085938, + "distillation_loss": 0.6221116781234741, + "epoch": 6.94, + "learning_rate": 5.971036019309321e-05, + "loss": 25.3774, + "step": 2770, + "task_loss": 0.3534473776817322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09910658281250001, + "compression/movement_sparsity/importance_threshold": -2.4250283297387685e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9758647856443842, + "compression/movement_sparsity/model_sparsity": 0.8772785102852165, + "compression_loss": 24.721454620361328, + "distillation_loss": 0.8919824957847595, + "epoch": 6.97, + "learning_rate": 5.952469365020423e-05, + "loss": 25.3448, + "step": 2780, + "task_loss": 0.6073164343833923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0991696234375, + "compression/movement_sparsity/importance_threshold": -2.2539153226371067e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9758740425511894, + "compression/movement_sparsity/model_sparsity": 0.877286832017118, + "compression_loss": 24.728708267211914, + "distillation_loss": 0.5647550821304321, + "epoch": 6.99, + "learning_rate": 5.933902710731526e-05, + "loss": 25.366, + "step": 2790, + "task_loss": 0.3817031979560852 + }, + { + "epoch": 7.0, + "eval_accuracy": 0.9273315681082671, + "eval_loss": 25.089332580566406, + "eval_runtime": 34.4549, + "eval_samples_per_second": 197.301, + "eval_steps_per_second": 3.106, + "step": 2793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0992354627, + "compression/movement_sparsity/importance_threshold": -2.075205892143172e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9758923799307437, + "compression/movement_sparsity/model_sparsity": 0.8773033168710372, + "compression_loss": 24.736276626586914, + "distillation_loss": 0.6459752321243286, + "epoch": 7.02, + "learning_rate": 5.91533605644263e-05, + "loss": 25.9937, + "step": 2800, + "task_loss": 0.4211961030960083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09929221120000001, + "compression/movement_sparsity/importance_threshold": -1.9211717834472734e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9759107761216501, + "compression/movement_sparsity/model_sparsity": 0.8773198545949178, + "compression_loss": 24.742263793945312, + "distillation_loss": 0.5500500798225403, + "epoch": 7.04, + "learning_rate": 5.896769402153732e-05, + "loss": 25.3942, + "step": 2810, + "task_loss": 0.34465354681015015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0993460797, + "compression/movement_sparsity/importance_threshold": -1.774954942750399e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.975883958145137, + "compression/movement_sparsity/model_sparsity": 0.8772957458925856, + "compression_loss": 24.7469482421875, + "distillation_loss": 0.5227401256561279, + "epoch": 7.07, + "learning_rate": 5.878202747864835e-05, + "loss": 25.3356, + "step": 2820, + "task_loss": 0.49045073986053467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09939714320000001, + "compression/movement_sparsity/importance_threshold": -1.636351795365103e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9759023896228546, + "compression/movement_sparsity/model_sparsity": 0.8773123153384428, + "compression_loss": 24.750398635864258, + "distillation_loss": 0.7634038925170898, + "epoch": 7.09, + "learning_rate": 5.8596360935759375e-05, + "loss": 25.2789, + "step": 2830, + "task_loss": 0.5661014318466187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09944547670000001, + "compression/movement_sparsity/importance_threshold": -1.5051587666038962e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9758934973464318, + "compression/movement_sparsity/model_sparsity": 0.8773043214003011, + "compression_loss": 24.7532901763916, + "distillation_loss": 1.0801090002059937, + "epoch": 7.12, + "learning_rate": 5.8410694392870405e-05, + "loss": 25.3397, + "step": 2840, + "task_loss": 0.766120195388794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09949115520000001, + "compression/movement_sparsity/importance_threshold": -1.3811722817793325e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9758936855427582, + "compression/movement_sparsity/model_sparsity": 0.8773044905841771, + "compression_loss": 24.75606346130371, + "distillation_loss": 0.45150721073150635, + "epoch": 7.14, + "learning_rate": 5.822502784998144e-05, + "loss": 25.2989, + "step": 2850, + "task_loss": 0.2520253658294678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0995342537, + "compression/movement_sparsity/importance_threshold": -1.264188766203966e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9759028836382114, + "compression/movement_sparsity/model_sparsity": 0.8773127594461174, + "compression_loss": 24.758466720581055, + "distillation_loss": 0.5204466581344604, + "epoch": 7.17, + "learning_rate": 5.8039361307092465e-05, + "loss": 25.3156, + "step": 2860, + "task_loss": 0.3384796380996704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09957484720000001, + "compression/movement_sparsity/importance_threshold": -1.1540046451901775e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9759030012609153, + "compression/movement_sparsity/model_sparsity": 0.8773128651860399, + "compression_loss": 24.76046371459961, + "distillation_loss": 0.7581231594085693, + "epoch": 7.19, + "learning_rate": 5.7853694764203494e-05, + "loss": 25.405, + "step": 2870, + "task_loss": 0.43696796894073486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0996130107, + "compression/movement_sparsity/importance_threshold": -1.050416344050651e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9759212092554953, + "compression/movement_sparsity/model_sparsity": 0.8773292337260443, + "compression_loss": 24.761972427368164, + "distillation_loss": 0.9415094256401062, + "epoch": 7.22, + "learning_rate": 5.766802822131452e-05, + "loss": 25.3604, + "step": 2880, + "task_loss": 0.466688334941864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09964881920000002, + "compression/movement_sparsity/importance_threshold": -9.53220288097854e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9759485094850948, + "compression/movement_sparsity/model_sparsity": 0.8773537759620589, + "compression_loss": 24.76201629638672, + "distillation_loss": 0.42088085412979126, + "epoch": 7.24, + "learning_rate": 5.7482361678425554e-05, + "loss": 25.324, + "step": 2890, + "task_loss": 0.2598356008529663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0996823477, + "compression/movement_sparsity/importance_threshold": -8.622129026442972e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.975966811577838, + "compression/movement_sparsity/model_sparsity": 0.8773702290940013, + "compression_loss": 24.76245880126953, + "distillation_loss": 0.6283578872680664, + "epoch": 7.27, + "learning_rate": 5.7296695135536584e-05, + "loss": 25.3934, + "step": 2900, + "task_loss": 0.49093014001846313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0997136712, + "compression/movement_sparsity/importance_threshold": -7.771906130025347e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9759940529960855, + "compression/movement_sparsity/model_sparsity": 0.8773947184600546, + "compression_loss": 24.76268768310547, + "distillation_loss": 0.7418371438980103, + "epoch": 7.29, + "learning_rate": 5.711102859264761e-05, + "loss": 25.3395, + "step": 2910, + "task_loss": 0.6282532811164856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0997428647, + "compression/movement_sparsity/importance_threshold": -6.979498444850338e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9760122374661246, + "compression/movement_sparsity/model_sparsity": 0.8774110658520745, + "compression_loss": 24.76232147216797, + "distillation_loss": 0.2934180200099945, + "epoch": 7.32, + "learning_rate": 5.692536204975864e-05, + "loss": 25.356, + "step": 2920, + "task_loss": 0.30722153186798096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09977000320000001, + "compression/movement_sparsity/importance_threshold": -6.242870224043486e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9760576045430593, + "compression/movement_sparsity/model_sparsity": 0.8774518497401864, + "compression_loss": 24.7608585357666, + "distillation_loss": 0.3775867223739624, + "epoch": 7.34, + "learning_rate": 5.673969550686966e-05, + "loss": 25.3606, + "step": 2930, + "task_loss": 0.2320680022239685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09979516170000001, + "compression/movement_sparsity/importance_threshold": -5.559985720730767e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9760939029095153, + "compression/movement_sparsity/model_sparsity": 0.8774844810802729, + "compression_loss": 24.758522033691406, + "distillation_loss": 0.4381892681121826, + "epoch": 7.37, + "learning_rate": 5.65540289639807e-05, + "loss": 25.3037, + "step": 2940, + "task_loss": 0.2565076947212219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09981841520000001, + "compression/movement_sparsity/importance_threshold": -4.928809188036419e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9761392229373682, + "compression/movement_sparsity/model_sparsity": 0.8775252226724158, + "compression_loss": 24.75603675842285, + "distillation_loss": 0.9314266443252563, + "epoch": 7.39, + "learning_rate": 5.6368362421091727e-05, + "loss": 25.3719, + "step": 2950, + "task_loss": 0.5608310103416443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09983983870000002, + "compression/movement_sparsity/importance_threshold": -4.347304879085551e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9761484327950919, + "compression/movement_sparsity/model_sparsity": 0.8775335021083484, + "compression_loss": 24.753707885742188, + "distillation_loss": 0.5958097577095032, + "epoch": 7.42, + "learning_rate": 5.618269587820275e-05, + "loss": 25.4208, + "step": 2960, + "task_loss": 0.4646613597869873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0998595072, + "compression/movement_sparsity/importance_threshold": -3.8134370470045698e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9761214971958747, + "compression/movement_sparsity/model_sparsity": 0.8775092876660936, + "compression_loss": 24.750883102416992, + "distillation_loss": 0.44721275568008423, + "epoch": 7.44, + "learning_rate": 5.599702933531378e-05, + "loss": 25.3144, + "step": 2970, + "task_loss": 0.33150601387023926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0998774957, + "compression/movement_sparsity/importance_threshold": -3.32516994491815e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9761216971544715, + "compression/movement_sparsity/model_sparsity": 0.8775094674239619, + "compression_loss": 24.74720001220703, + "distillation_loss": 0.9337691068649292, + "epoch": 7.47, + "learning_rate": 5.58113627924248e-05, + "loss": 25.3668, + "step": 2980, + "task_loss": 0.7225836515426636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09989387920000001, + "compression/movement_sparsity/importance_threshold": -2.880467825950965e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9761489738595303, + "compression/movement_sparsity/model_sparsity": 0.8775339885119918, + "compression_loss": 24.743270874023438, + "distillation_loss": 0.4354209303855896, + "epoch": 7.49, + "learning_rate": 5.562569624953584e-05, + "loss": 25.3452, + "step": 2990, + "task_loss": 0.3905031681060791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0999087327, + "compression/movement_sparsity/importance_threshold": -2.4772949432289894e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9761672994768142, + "compression/movement_sparsity/model_sparsity": 0.8775504627919188, + "compression_loss": 24.738487243652344, + "distillation_loss": 0.545594334602356, + "epoch": 7.52, + "learning_rate": 5.544002970664687e-05, + "loss": 25.2934, + "step": 3000, + "task_loss": 0.48204469680786133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0999221312, + "compression/movement_sparsity/importance_threshold": -2.1136155498773304e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9761675464844927, + "compression/movement_sparsity/model_sparsity": 0.8775506848457562, + "compression_loss": 24.733604431152344, + "distillation_loss": 0.46750152111053467, + "epoch": 7.54, + "learning_rate": 5.525436316375789e-05, + "loss": 25.2953, + "step": 3010, + "task_loss": 0.2321946620941162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0999341497, + "compression/movement_sparsity/importance_threshold": -1.7873938990210954e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9761496560712135, + "compression/movement_sparsity/model_sparsity": 0.8775346018035424, + "compression_loss": 24.729167938232422, + "distillation_loss": 0.6891891956329346, + "epoch": 7.57, + "learning_rate": 5.506869662086892e-05, + "loss": 25.3244, + "step": 3020, + "task_loss": 0.33235394954681396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09994486320000001, + "compression/movement_sparsity/importance_threshold": -1.4965942437853916e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9761768739649203, + "compression/movement_sparsity/model_sparsity": 0.8775590700216112, + "compression_loss": 24.72458267211914, + "distillation_loss": 0.5351017713546753, + "epoch": 7.59, + "learning_rate": 5.4883030077979945e-05, + "loss": 25.3094, + "step": 3030, + "task_loss": 0.41574662923812866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0999543467, + "compression/movement_sparsity/importance_threshold": -1.23918083729576e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9761679581639566, + "compression/movement_sparsity/model_sparsity": 0.8775510549354849, + "compression_loss": 24.719385147094727, + "distillation_loss": 0.6737346053123474, + "epoch": 7.62, + "learning_rate": 5.469736353509098e-05, + "loss": 25.2518, + "step": 3040, + "task_loss": 0.6407185792922974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0999626752, + "compression/movement_sparsity/importance_threshold": -1.0131179326773078e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9761590658875339, + "compression/movement_sparsity/model_sparsity": 0.8775430609973431, + "compression_loss": 24.713476181030273, + "distillation_loss": 0.7363412976264954, + "epoch": 7.64, + "learning_rate": 5.451169699220201e-05, + "loss": 25.3053, + "step": 3050, + "task_loss": 0.38132327795028687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0999699237, + "compression/movement_sparsity/importance_threshold": -8.163697830555761e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9761502324224631, + "compression/movement_sparsity/model_sparsity": 0.8775351199291627, + "compression_loss": 24.70734405517578, + "distillation_loss": 0.5943996906280518, + "epoch": 7.67, + "learning_rate": 5.4326030449313035e-05, + "loss": 25.2634, + "step": 3060, + "task_loss": 0.34441137313842773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09997616720000001, + "compression/movement_sparsity/importance_threshold": -6.469006415548047e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9762135604862993, + "compression/movement_sparsity/model_sparsity": 0.8775920503034419, + "compression_loss": 24.701248168945312, + "distillation_loss": 0.8021811246871948, + "epoch": 7.69, + "learning_rate": 5.4140363906424065e-05, + "loss": 25.296, + "step": 3070, + "task_loss": 0.5133693218231201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09998148070000001, + "compression/movement_sparsity/importance_threshold": -5.02674761301402e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9762318155299609, + "compression/movement_sparsity/model_sparsity": 0.8776084611394154, + "compression_loss": 24.6949520111084, + "distillation_loss": 1.1157962083816528, + "epoch": 7.72, + "learning_rate": 5.395469736353509e-05, + "loss": 25.2701, + "step": 3080, + "task_loss": 0.6061588525772095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0999859392, + "compression/movement_sparsity/importance_threshold": -3.816563954204752e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9762319331526649, + "compression/movement_sparsity/model_sparsity": 0.8776085668793379, + "compression_loss": 24.68860626220703, + "distillation_loss": 0.46376800537109375, + "epoch": 7.74, + "learning_rate": 5.3769030820646124e-05, + "loss": 25.2613, + "step": 3090, + "task_loss": 0.2882155776023865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0999896177, + "compression/movement_sparsity/importance_threshold": -2.81809797036698e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9762232055480277, + "compression/movement_sparsity/model_sparsity": 0.8776007209770876, + "compression_loss": 24.68145751953125, + "distillation_loss": 0.42821407318115234, + "epoch": 7.77, + "learning_rate": 5.3583364277757154e-05, + "loss": 25.2447, + "step": 3100, + "task_loss": 0.2803751826286316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09999259120000001, + "compression/movement_sparsity/importance_threshold": -2.0109921927561128e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9762233231707317, + "compression/movement_sparsity/model_sparsity": 0.8776008267170101, + "compression_loss": 24.674474716186523, + "distillation_loss": 0.40806517004966736, + "epoch": 7.79, + "learning_rate": 5.339769773486818e-05, + "loss": 25.1801, + "step": 3110, + "task_loss": 0.241238534450531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09999493470000001, + "compression/movement_sparsity/importance_threshold": -1.3748891526232238e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9762505410644384, + "compression/movement_sparsity/model_sparsity": 0.8776252949350789, + "compression_loss": 24.667421340942383, + "distillation_loss": 0.4663747549057007, + "epoch": 7.82, + "learning_rate": 5.321203119197921e-05, + "loss": 25.2171, + "step": 3120, + "task_loss": 0.3210309147834778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09999672320000001, + "compression/movement_sparsity/importance_threshold": -8.894313812237223e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9762506704494128, + "compression/movement_sparsity/model_sparsity": 0.8776254112489936, + "compression_loss": 24.660564422607422, + "distillation_loss": 0.6921601295471191, + "epoch": 7.84, + "learning_rate": 5.302636464909023e-05, + "loss": 25.2151, + "step": 3130, + "task_loss": 0.3866957426071167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0999980317, + "compression/movement_sparsity/importance_threshold": -5.342614098086812e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9762688431571815, + "compression/movement_sparsity/model_sparsity": 0.8776417480670213, + "compression_loss": 24.65365982055664, + "distillation_loss": 0.7762662172317505, + "epoch": 7.87, + "learning_rate": 5.284069810620127e-05, + "loss": 25.1716, + "step": 3140, + "task_loss": 0.5304232835769653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09999893520000001, + "compression/movement_sparsity/importance_threshold": -2.890217696291733e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9762780530149051, + "compression/movement_sparsity/model_sparsity": 0.8776500275029538, + "compression_loss": 24.646541595458984, + "distillation_loss": 0.7200348973274231, + "epoch": 7.89, + "learning_rate": 5.26550315633123e-05, + "loss": 25.1668, + "step": 3150, + "task_loss": 0.526270866394043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0999995087, + "compression/movement_sparsity/importance_threshold": -1.3335499194494504e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9762511527024993, + "compression/movement_sparsity/model_sparsity": 0.8776258447826759, + "compression_loss": 24.639205932617188, + "distillation_loss": 0.3733695149421692, + "epoch": 7.92, + "learning_rate": 5.246936502042332e-05, + "loss": 25.1241, + "step": 3160, + "task_loss": 0.3617507815361023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0999998272, + "compression/movement_sparsity/importance_threshold": -4.690360799405879e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.97626938422162, + "compression/movement_sparsity/model_sparsity": 0.8776422344706649, + "compression_loss": 24.632028579711914, + "distillation_loss": 0.3199753165245056, + "epoch": 7.94, + "learning_rate": 5.228369847753435e-05, + "loss": 25.1482, + "step": 3170, + "task_loss": 0.23808979988098145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09999996570000001, + "compression/movement_sparsity/importance_threshold": -9.310149040597793e-10, + "compression/movement_sparsity/linear_layer_sparsity": 0.9762965785907859, + "compression/movement_sparsity/model_sparsity": 0.8776666815407491, + "compression_loss": 24.62479591369629, + "distillation_loss": 0.8704989552497864, + "epoch": 7.97, + "learning_rate": 5.209803193464537e-05, + "loss": 25.1702, + "step": 3180, + "task_loss": 0.5461669564247131 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09999999920000001, + "compression/movement_sparsity/importance_threshold": -2.1714633129804595e-11, + "compression/movement_sparsity/linear_layer_sparsity": 0.9763057061126167, + "compression/movement_sparsity/model_sparsity": 0.8776748869587359, + "compression_loss": 24.61789894104004, + "distillation_loss": 0.3547750413417816, + "epoch": 7.99, + "learning_rate": 5.191236539175641e-05, + "loss": 25.1369, + "step": 3190, + "task_loss": 0.3040663003921509 + }, + { + "epoch": 8.0, + "eval_accuracy": 0.9393939393939394, + "eval_loss": 24.897581100463867, + "eval_runtime": 32.2307, + "eval_samples_per_second": 210.917, + "eval_steps_per_second": 3.32, + "step": 3192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.72288978099823, + "epoch": 8.02, + "learning_rate": 5.172669884886744e-05, + "loss": 6.5592, + "step": 3200, + "task_loss": 0.7085144519805908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 1.2470171451568604, + "epoch": 8.05, + "learning_rate": 5.155959896026736e-05, + "loss": 0.8523, + "step": 3210, + "task_loss": 0.8521549701690674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.5626842379570007, + "epoch": 8.07, + "learning_rate": 5.137393241737839e-05, + "loss": 0.6437, + "step": 3220, + "task_loss": 0.7031203508377075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.6048372983932495, + "epoch": 8.1, + "learning_rate": 5.1206832528778315e-05, + "loss": 0.6976, + "step": 3230, + "task_loss": 0.26381248235702515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4602692127227783, + "epoch": 8.12, + "learning_rate": 5.102116598588934e-05, + "loss": 0.6751, + "step": 3240, + "task_loss": 0.3525872230529785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4143044948577881, + "epoch": 8.15, + "learning_rate": 5.083549944300038e-05, + "loss": 0.6573, + "step": 3250, + "task_loss": 0.21413344144821167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.6834879517555237, + "epoch": 8.17, + "learning_rate": 5.0649832900111405e-05, + "loss": 0.5948, + "step": 3260, + "task_loss": 0.40228837728500366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4829801023006439, + "epoch": 8.2, + "learning_rate": 5.046416635722243e-05, + "loss": 0.6649, + "step": 3270, + "task_loss": 0.3167864680290222 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.5986191034317017, + "epoch": 8.22, + "learning_rate": 5.027849981433346e-05, + "loss": 0.6497, + "step": 3280, + "task_loss": 0.35720396041870117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.5706828832626343, + "epoch": 8.25, + "learning_rate": 5.009283327144448e-05, + "loss": 0.5852, + "step": 3290, + "task_loss": 0.45791006088256836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.6524888277053833, + "epoch": 8.27, + "learning_rate": 4.990716672855552e-05, + "loss": 0.648, + "step": 3300, + "task_loss": 0.6969070434570312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.5508350133895874, + "epoch": 8.3, + "learning_rate": 4.972150018566655e-05, + "loss": 0.5942, + "step": 3310, + "task_loss": 0.39406853914260864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.45713961124420166, + "epoch": 8.32, + "learning_rate": 4.953583364277758e-05, + "loss": 0.5238, + "step": 3320, + "task_loss": 0.33260881900787354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.8765674829483032, + "epoch": 8.35, + "learning_rate": 4.93501670998886e-05, + "loss": 0.5391, + "step": 3330, + "task_loss": 0.6223745942115784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.7946939468383789, + "epoch": 8.37, + "learning_rate": 4.916450055699963e-05, + "loss": 0.5532, + "step": 3340, + "task_loss": 0.5723085999488831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.7733598947525024, + "epoch": 8.4, + "learning_rate": 4.897883401411066e-05, + "loss": 0.5068, + "step": 3350, + "task_loss": 0.5443770885467529 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3593675494194031, + "epoch": 8.42, + "learning_rate": 4.879316747122169e-05, + "loss": 0.5747, + "step": 3360, + "task_loss": 0.23313909769058228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.709130585193634, + "epoch": 8.45, + "learning_rate": 4.860750092833272e-05, + "loss": 0.6165, + "step": 3370, + "task_loss": 0.5371474027633667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.8647085428237915, + "epoch": 8.47, + "learning_rate": 4.842183438544374e-05, + "loss": 0.5889, + "step": 3380, + "task_loss": 0.6712241768836975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.5399154424667358, + "epoch": 8.5, + "learning_rate": 4.823616784255477e-05, + "loss": 0.5368, + "step": 3390, + "task_loss": 0.4131709933280945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.432843416929245, + "epoch": 8.52, + "learning_rate": 4.80505012996658e-05, + "loss": 0.5734, + "step": 3400, + "task_loss": 0.21259355545043945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.8370758295059204, + "epoch": 8.55, + "learning_rate": 4.786483475677683e-05, + "loss": 0.5052, + "step": 3410, + "task_loss": 0.5703924894332886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.5906082391738892, + "epoch": 8.57, + "learning_rate": 4.767916821388786e-05, + "loss": 0.5002, + "step": 3420, + "task_loss": 0.34540170431137085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4898844063282013, + "epoch": 8.6, + "learning_rate": 4.7493501670998885e-05, + "loss": 0.4693, + "step": 3430, + "task_loss": 0.4294593334197998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.6780542135238647, + "epoch": 8.62, + "learning_rate": 4.7307835128109915e-05, + "loss": 0.561, + "step": 3440, + "task_loss": 0.5066669583320618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.8965948224067688, + "epoch": 8.65, + "learning_rate": 4.7122168585220945e-05, + "loss": 0.5198, + "step": 3450, + "task_loss": 0.4829084277153015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.9280937910079956, + "epoch": 8.67, + "learning_rate": 4.6936502042331975e-05, + "loss": 0.5237, + "step": 3460, + "task_loss": 0.4782121181488037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.26591920852661133, + "epoch": 8.7, + "learning_rate": 4.6750835499443005e-05, + "loss": 0.5247, + "step": 3470, + "task_loss": 0.16707593202590942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 1.0053449869155884, + "epoch": 8.72, + "learning_rate": 4.6565168956554035e-05, + "loss": 0.5456, + "step": 3480, + "task_loss": 0.6738066673278809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.6823339462280273, + "epoch": 8.75, + "learning_rate": 4.637950241366506e-05, + "loss": 0.4965, + "step": 3490, + "task_loss": 0.43921583890914917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.5958259105682373, + "epoch": 8.77, + "learning_rate": 4.619383587077609e-05, + "loss": 0.4856, + "step": 3500, + "task_loss": 0.39036643505096436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.8999524116516113, + "epoch": 8.8, + "learning_rate": 4.600816932788712e-05, + "loss": 0.5925, + "step": 3510, + "task_loss": 0.7617757320404053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.24529604613780975, + "epoch": 8.82, + "learning_rate": 4.582250278499815e-05, + "loss": 0.503, + "step": 3520, + "task_loss": 0.3046609163284302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.21389566361904144, + "epoch": 8.85, + "learning_rate": 4.563683624210918e-05, + "loss": 0.4983, + "step": 3530, + "task_loss": 0.10367703437805176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4183430075645447, + "epoch": 8.87, + "learning_rate": 4.54511696992202e-05, + "loss": 0.4765, + "step": 3540, + "task_loss": 0.23815250396728516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3706932067871094, + "epoch": 8.9, + "learning_rate": 4.526550315633123e-05, + "loss": 0.4768, + "step": 3550, + "task_loss": 0.45666825771331787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.24162426590919495, + "epoch": 8.92, + "learning_rate": 4.507983661344226e-05, + "loss": 0.4536, + "step": 3560, + "task_loss": 0.1500059962272644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4512966275215149, + "epoch": 8.95, + "learning_rate": 4.489417007055329e-05, + "loss": 0.499, + "step": 3570, + "task_loss": 0.28521621227264404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3897574841976166, + "epoch": 8.97, + "learning_rate": 4.470850352766432e-05, + "loss": 0.5217, + "step": 3580, + "task_loss": 0.39923566579818726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.6049056053161621, + "epoch": 9.0, + "learning_rate": 4.452283698477534e-05, + "loss": 0.4678, + "step": 3590, + "task_loss": 0.42887067794799805 + }, + { + "epoch": 9.0, + "eval_accuracy": 0.9435127978817299, + "eval_loss": 0.25284168124198914, + "eval_runtime": 31.9037, + "eval_samples_per_second": 213.079, + "eval_steps_per_second": 3.354, + "step": 3591 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.2613956332206726, + "epoch": 9.02, + "learning_rate": 4.433717044188637e-05, + "loss": 0.4558, + "step": 3600, + "task_loss": 0.17715340852737427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4356125295162201, + "epoch": 9.05, + "learning_rate": 4.41515038989974e-05, + "loss": 0.4894, + "step": 3610, + "task_loss": 0.317924439907074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4117315411567688, + "epoch": 9.07, + "learning_rate": 4.396583735610843e-05, + "loss": 0.4427, + "step": 3620, + "task_loss": 0.6888800859451294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3136410415172577, + "epoch": 9.1, + "learning_rate": 4.378017081321946e-05, + "loss": 0.4228, + "step": 3630, + "task_loss": 0.2511032223701477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.30205368995666504, + "epoch": 9.12, + "learning_rate": 4.3594504270330486e-05, + "loss": 0.46, + "step": 3640, + "task_loss": 0.20458006858825684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.37457796931266785, + "epoch": 9.15, + "learning_rate": 4.3408837727441515e-05, + "loss": 0.4972, + "step": 3650, + "task_loss": 0.2535884976387024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.5417513847351074, + "epoch": 9.17, + "learning_rate": 4.3223171184552545e-05, + "loss": 0.4358, + "step": 3660, + "task_loss": 0.3054230213165283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.371917724609375, + "epoch": 9.2, + "learning_rate": 4.3037504641663575e-05, + "loss": 0.4257, + "step": 3670, + "task_loss": 0.22020912170410156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.5019680857658386, + "epoch": 9.22, + "learning_rate": 4.2851838098774605e-05, + "loss": 0.505, + "step": 3680, + "task_loss": 0.3748323321342468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.17344295978546143, + "epoch": 9.25, + "learning_rate": 4.266617155588563e-05, + "loss": 0.4758, + "step": 3690, + "task_loss": 0.08035105466842651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.6138920187950134, + "epoch": 9.27, + "learning_rate": 4.248050501299666e-05, + "loss": 0.3982, + "step": 3700, + "task_loss": 0.39185112714767456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.6749163269996643, + "epoch": 9.3, + "learning_rate": 4.229483847010769e-05, + "loss": 0.4836, + "step": 3710, + "task_loss": 0.5913939476013184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.24625588953495026, + "epoch": 9.32, + "learning_rate": 4.210917192721872e-05, + "loss": 0.3976, + "step": 3720, + "task_loss": 0.16921091079711914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.1086372509598732, + "epoch": 9.35, + "learning_rate": 4.192350538432975e-05, + "loss": 0.4253, + "step": 3730, + "task_loss": 0.05092066526412964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.5961346626281738, + "epoch": 9.37, + "learning_rate": 4.173783884144077e-05, + "loss": 0.4541, + "step": 3740, + "task_loss": 0.34825223684310913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.47256791591644287, + "epoch": 9.4, + "learning_rate": 4.15521722985518e-05, + "loss": 0.4663, + "step": 3750, + "task_loss": 0.5208013653755188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.2828874886035919, + "epoch": 9.42, + "learning_rate": 4.136650575566283e-05, + "loss": 0.4305, + "step": 3760, + "task_loss": 0.3413585424423218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.480175256729126, + "epoch": 9.45, + "learning_rate": 4.118083921277386e-05, + "loss": 0.4078, + "step": 3770, + "task_loss": 0.5471338033676147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.5372006893157959, + "epoch": 9.47, + "learning_rate": 4.099517266988489e-05, + "loss": 0.3807, + "step": 3780, + "task_loss": 0.5899592041969299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.23467813432216644, + "epoch": 9.5, + "learning_rate": 4.080950612699591e-05, + "loss": 0.4435, + "step": 3790, + "task_loss": 0.15050894021987915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.7140940427780151, + "epoch": 9.52, + "learning_rate": 4.062383958410695e-05, + "loss": 0.4246, + "step": 3800, + "task_loss": 0.5424280762672424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.47991132736206055, + "epoch": 9.55, + "learning_rate": 4.043817304121797e-05, + "loss": 0.3975, + "step": 3810, + "task_loss": 0.2305130958557129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.7091490030288696, + "epoch": 9.57, + "learning_rate": 4.0252506498329e-05, + "loss": 0.4422, + "step": 3820, + "task_loss": 0.5321277976036072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4778183102607727, + "epoch": 9.6, + "learning_rate": 4.006683995544003e-05, + "loss": 0.4259, + "step": 3830, + "task_loss": 0.3826780915260315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.8267277479171753, + "epoch": 9.62, + "learning_rate": 3.988117341255106e-05, + "loss": 0.4805, + "step": 3840, + "task_loss": 0.4951780438423157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.33411291241645813, + "epoch": 9.65, + "learning_rate": 3.969550686966209e-05, + "loss": 0.4404, + "step": 3850, + "task_loss": 0.19210630655288696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.12147143483161926, + "epoch": 9.67, + "learning_rate": 3.9509840326773116e-05, + "loss": 0.3758, + "step": 3860, + "task_loss": 0.36583495140075684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.2699703574180603, + "epoch": 9.7, + "learning_rate": 3.9324173783884146e-05, + "loss": 0.4872, + "step": 3870, + "task_loss": 0.1552247405052185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.32069188356399536, + "epoch": 9.72, + "learning_rate": 3.9138507240995175e-05, + "loss": 0.378, + "step": 3880, + "task_loss": 0.2125515341758728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.497938334941864, + "epoch": 9.75, + "learning_rate": 3.8952840698106205e-05, + "loss": 0.4068, + "step": 3890, + "task_loss": 0.416523277759552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.36496463418006897, + "epoch": 9.77, + "learning_rate": 3.8767174155217235e-05, + "loss": 0.3873, + "step": 3900, + "task_loss": 0.2737278938293457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4038648009300232, + "epoch": 9.8, + "learning_rate": 3.858150761232826e-05, + "loss": 0.3727, + "step": 3910, + "task_loss": 0.27509230375289917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.5693778991699219, + "epoch": 9.82, + "learning_rate": 3.839584106943929e-05, + "loss": 0.4398, + "step": 3920, + "task_loss": 0.5003637671470642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.7274796962738037, + "epoch": 9.85, + "learning_rate": 3.821017452655032e-05, + "loss": 0.4433, + "step": 3930, + "task_loss": 0.47839629650115967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.23561792075634003, + "epoch": 9.87, + "learning_rate": 3.802450798366135e-05, + "loss": 0.4133, + "step": 3940, + "task_loss": 0.20580154657363892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3768826723098755, + "epoch": 9.9, + "learning_rate": 3.783884144077238e-05, + "loss": 0.4062, + "step": 3950, + "task_loss": 0.21568721532821655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.26365137100219727, + "epoch": 9.92, + "learning_rate": 3.76531748978834e-05, + "loss": 0.3533, + "step": 3960, + "task_loss": 0.26262664794921875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.2154502123594284, + "epoch": 9.95, + "learning_rate": 3.746750835499443e-05, + "loss": 0.3956, + "step": 3970, + "task_loss": 0.1700526475906372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4734809994697571, + "epoch": 9.97, + "learning_rate": 3.728184181210546e-05, + "loss": 0.3946, + "step": 3980, + "task_loss": 0.44668906927108765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.5860964059829712, + "epoch": 10.0, + "learning_rate": 3.709617526921649e-05, + "loss": 0.3576, + "step": 3990, + "task_loss": 0.39208894968032837 + }, + { + "epoch": 10.0, + "eval_accuracy": 0.9613121506325389, + "eval_loss": 0.18729890882968903, + "eval_runtime": 31.4986, + "eval_samples_per_second": 215.819, + "eval_steps_per_second": 3.397, + "step": 3990 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.33489423990249634, + "epoch": 10.03, + "learning_rate": 3.691050872632752e-05, + "loss": 0.3657, + "step": 4000, + "task_loss": 0.4493994116783142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.17722754180431366, + "epoch": 10.05, + "learning_rate": 3.672484218343854e-05, + "loss": 0.4084, + "step": 4010, + "task_loss": 0.09126198291778564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.5721703171730042, + "epoch": 10.08, + "learning_rate": 3.653917564054957e-05, + "loss": 0.3856, + "step": 4020, + "task_loss": 0.33346736431121826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.14619529247283936, + "epoch": 10.1, + "learning_rate": 3.63535090976606e-05, + "loss": 0.3946, + "step": 4030, + "task_loss": 0.10846388339996338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.49323850870132446, + "epoch": 10.13, + "learning_rate": 3.616784255477163e-05, + "loss": 0.3667, + "step": 4040, + "task_loss": 0.21929258108139038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.24016423523426056, + "epoch": 10.15, + "learning_rate": 3.598217601188266e-05, + "loss": 0.3747, + "step": 4050, + "task_loss": 0.2389882206916809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.2317166030406952, + "epoch": 10.18, + "learning_rate": 3.5796509468993686e-05, + "loss": 0.4108, + "step": 4060, + "task_loss": 0.1898442506790161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.2921193838119507, + "epoch": 10.2, + "learning_rate": 3.5610842926104716e-05, + "loss": 0.3237, + "step": 4070, + "task_loss": 0.16486376523971558 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3699590861797333, + "epoch": 10.23, + "learning_rate": 3.5425176383215746e-05, + "loss": 0.3565, + "step": 4080, + "task_loss": 0.15236175060272217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.874591588973999, + "epoch": 10.25, + "learning_rate": 3.5239509840326776e-05, + "loss": 0.4556, + "step": 4090, + "task_loss": 0.45849525928497314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4874110221862793, + "epoch": 10.28, + "learning_rate": 3.5053843297437805e-05, + "loss": 0.3292, + "step": 4100, + "task_loss": 0.44578224420547485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.39064881205558777, + "epoch": 10.3, + "learning_rate": 3.486817675454883e-05, + "loss": 0.3842, + "step": 4110, + "task_loss": 0.34669435024261475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4172508716583252, + "epoch": 10.33, + "learning_rate": 3.4682510211659865e-05, + "loss": 0.385, + "step": 4120, + "task_loss": 0.37710291147232056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.2650163769721985, + "epoch": 10.35, + "learning_rate": 3.449684366877089e-05, + "loss": 0.359, + "step": 4130, + "task_loss": 0.10662341117858887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.07445116341114044, + "epoch": 10.38, + "learning_rate": 3.431117712588192e-05, + "loss": 0.355, + "step": 4140, + "task_loss": 0.035188257694244385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.6903338432312012, + "epoch": 10.4, + "learning_rate": 3.412551058299295e-05, + "loss": 0.3718, + "step": 4150, + "task_loss": 0.6073799729347229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.21276339888572693, + "epoch": 10.43, + "learning_rate": 3.393984404010397e-05, + "loss": 0.3869, + "step": 4160, + "task_loss": 0.10622841119766235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.2718493640422821, + "epoch": 10.45, + "learning_rate": 3.375417749721501e-05, + "loss": 0.3496, + "step": 4170, + "task_loss": 0.1441662311553955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.29152435064315796, + "epoch": 10.48, + "learning_rate": 3.356851095432603e-05, + "loss": 0.4076, + "step": 4180, + "task_loss": 0.1816622018814087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3583090901374817, + "epoch": 10.5, + "learning_rate": 3.338284441143706e-05, + "loss": 0.3713, + "step": 4190, + "task_loss": 0.1968095302581787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.24545052647590637, + "epoch": 10.53, + "learning_rate": 3.319717786854809e-05, + "loss": 0.33, + "step": 4200, + "task_loss": 0.17970728874206543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.35631152987480164, + "epoch": 10.55, + "learning_rate": 3.3011511325659114e-05, + "loss": 0.3431, + "step": 4210, + "task_loss": 0.22519803047180176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.19979014992713928, + "epoch": 10.58, + "learning_rate": 3.282584478277015e-05, + "loss": 0.3529, + "step": 4220, + "task_loss": 0.1249743103981018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4167667031288147, + "epoch": 10.6, + "learning_rate": 3.2640178239881173e-05, + "loss": 0.4016, + "step": 4230, + "task_loss": 0.25605911016464233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.22025595605373383, + "epoch": 10.63, + "learning_rate": 3.24545116969922e-05, + "loss": 0.3638, + "step": 4240, + "task_loss": 0.18087583780288696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.22751134634017944, + "epoch": 10.65, + "learning_rate": 3.226884515410323e-05, + "loss": 0.3196, + "step": 4250, + "task_loss": 0.1847636103630066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.07607132196426392, + "epoch": 10.68, + "learning_rate": 3.2083178611214256e-05, + "loss": 0.368, + "step": 4260, + "task_loss": 0.060303688049316406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.13270694017410278, + "epoch": 10.7, + "learning_rate": 3.189751206832529e-05, + "loss": 0.363, + "step": 4270, + "task_loss": 0.04249483346939087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.297863245010376, + "epoch": 10.73, + "learning_rate": 3.1711845525436316e-05, + "loss": 0.3602, + "step": 4280, + "task_loss": 0.23361259698867798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.5075054168701172, + "epoch": 10.75, + "learning_rate": 3.1526178982547346e-05, + "loss": 0.3169, + "step": 4290, + "task_loss": 0.32663124799728394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.2867015600204468, + "epoch": 10.78, + "learning_rate": 3.1340512439658376e-05, + "loss": 0.3851, + "step": 4300, + "task_loss": 0.1601826548576355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.168656587600708, + "epoch": 10.8, + "learning_rate": 3.1154845896769406e-05, + "loss": 0.3617, + "step": 4310, + "task_loss": 0.08980768918991089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.5464335680007935, + "epoch": 10.83, + "learning_rate": 3.0969179353880436e-05, + "loss": 0.4048, + "step": 4320, + "task_loss": 0.33633947372436523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.644969642162323, + "epoch": 10.85, + "learning_rate": 3.078351281099146e-05, + "loss": 0.3903, + "step": 4330, + "task_loss": 0.4570090174674988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.07465117424726486, + "epoch": 10.88, + "learning_rate": 3.059784626810249e-05, + "loss": 0.3063, + "step": 4340, + "task_loss": 0.05527830123901367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.2435240000486374, + "epoch": 10.9, + "learning_rate": 3.041217972521352e-05, + "loss": 0.354, + "step": 4350, + "task_loss": 0.13510406017303467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.07434059679508209, + "epoch": 10.93, + "learning_rate": 3.0226513182324545e-05, + "loss": 0.3235, + "step": 4360, + "task_loss": 0.1570771336555481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.6119037866592407, + "epoch": 10.95, + "learning_rate": 3.0040846639435578e-05, + "loss": 0.361, + "step": 4370, + "task_loss": 0.4033900499343872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.21598446369171143, + "epoch": 10.98, + "learning_rate": 2.9855180096546605e-05, + "loss": 0.3622, + "step": 4380, + "task_loss": 0.16614818572998047 + }, + { + "epoch": 11.0, + "eval_accuracy": 0.9645483965872316, + "eval_loss": 0.15833532810211182, + "eval_runtime": 32.1471, + "eval_samples_per_second": 211.465, + "eval_steps_per_second": 3.328, + "step": 4389 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4798199236392975, + "epoch": 11.0, + "learning_rate": 2.966951355365763e-05, + "loss": 0.3661, + "step": 4390, + "task_loss": 0.4825071096420288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.22367967665195465, + "epoch": 11.03, + "learning_rate": 2.948384701076866e-05, + "loss": 0.3392, + "step": 4400, + "task_loss": 0.10440462827682495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.42288827896118164, + "epoch": 11.05, + "learning_rate": 2.9298180467879687e-05, + "loss": 0.4004, + "step": 4410, + "task_loss": 0.2021128535270691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.374217689037323, + "epoch": 11.08, + "learning_rate": 2.911251392499072e-05, + "loss": 0.3452, + "step": 4420, + "task_loss": 0.2914271950721741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3371475636959076, + "epoch": 11.1, + "learning_rate": 2.8926847382101747e-05, + "loss": 0.3522, + "step": 4430, + "task_loss": 0.25141412019729614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.32083988189697266, + "epoch": 11.13, + "learning_rate": 2.8741180839212777e-05, + "loss": 0.349, + "step": 4440, + "task_loss": 0.19536364078521729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.19574186205863953, + "epoch": 11.15, + "learning_rate": 2.8555514296323804e-05, + "loss": 0.3199, + "step": 4450, + "task_loss": 0.3254619836807251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3210104703903198, + "epoch": 11.18, + "learning_rate": 2.836984775343483e-05, + "loss": 0.2991, + "step": 4460, + "task_loss": 0.246832013130188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3192119598388672, + "epoch": 11.2, + "learning_rate": 2.8184181210545863e-05, + "loss": 0.3292, + "step": 4470, + "task_loss": 0.33659565448760986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.48159974813461304, + "epoch": 11.23, + "learning_rate": 2.799851466765689e-05, + "loss": 0.3257, + "step": 4480, + "task_loss": 0.3302932381629944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3700810670852661, + "epoch": 11.25, + "learning_rate": 2.781284812476792e-05, + "loss": 0.3308, + "step": 4490, + "task_loss": 0.348072350025177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.9608942270278931, + "epoch": 11.28, + "learning_rate": 2.7627181581878946e-05, + "loss": 0.391, + "step": 4500, + "task_loss": 0.6507740020751953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.2407981902360916, + "epoch": 11.3, + "learning_rate": 2.7441515038989973e-05, + "loss": 0.3207, + "step": 4510, + "task_loss": 0.3147467374801636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4099271893501282, + "epoch": 11.33, + "learning_rate": 2.7255848496101006e-05, + "loss": 0.3287, + "step": 4520, + "task_loss": 0.29311758279800415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.13899505138397217, + "epoch": 11.35, + "learning_rate": 2.7070181953212032e-05, + "loss": 0.3213, + "step": 4530, + "task_loss": 0.05822998285293579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4563303589820862, + "epoch": 11.38, + "learning_rate": 2.6884515410323062e-05, + "loss": 0.3336, + "step": 4540, + "task_loss": 0.4114644527435303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.40684545040130615, + "epoch": 11.4, + "learning_rate": 2.669884886743409e-05, + "loss": 0.2897, + "step": 4550, + "task_loss": 0.19216406345367432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.03487290441989899, + "epoch": 11.43, + "learning_rate": 2.6513182324545115e-05, + "loss": 0.3023, + "step": 4560, + "task_loss": 0.01862436532974243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.23155973851680756, + "epoch": 11.45, + "learning_rate": 2.632751578165615e-05, + "loss": 0.3439, + "step": 4570, + "task_loss": 0.24344253540039062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.15806563198566437, + "epoch": 11.48, + "learning_rate": 2.6141849238767175e-05, + "loss": 0.3233, + "step": 4580, + "task_loss": 0.13851886987686157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.31244856119155884, + "epoch": 11.5, + "learning_rate": 2.5956182695878205e-05, + "loss": 0.3238, + "step": 4590, + "task_loss": 0.30514633655548096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4812265932559967, + "epoch": 11.53, + "learning_rate": 2.577051615298923e-05, + "loss": 0.3769, + "step": 4600, + "task_loss": 0.5851138830184937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.17942315340042114, + "epoch": 11.55, + "learning_rate": 2.5584849610100258e-05, + "loss": 0.3338, + "step": 4610, + "task_loss": 0.07811307907104492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.36539405584335327, + "epoch": 11.58, + "learning_rate": 2.539918306721129e-05, + "loss": 0.3219, + "step": 4620, + "task_loss": 0.24668723344802856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.25031495094299316, + "epoch": 11.6, + "learning_rate": 2.5213516524322318e-05, + "loss": 0.2863, + "step": 4630, + "task_loss": 0.21200621128082275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.10666746646165848, + "epoch": 11.63, + "learning_rate": 2.5027849981433347e-05, + "loss": 0.3188, + "step": 4640, + "task_loss": 0.09696447849273682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.5051015615463257, + "epoch": 11.65, + "learning_rate": 2.4842183438544374e-05, + "loss": 0.3786, + "step": 4650, + "task_loss": 0.3472709059715271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.25747379660606384, + "epoch": 11.68, + "learning_rate": 2.4656516895655404e-05, + "loss": 0.3005, + "step": 4660, + "task_loss": 0.21027618646621704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.422207236289978, + "epoch": 11.7, + "learning_rate": 2.4470850352766434e-05, + "loss": 0.2963, + "step": 4670, + "task_loss": 0.33926063776016235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3747057020664215, + "epoch": 11.73, + "learning_rate": 2.4285183809877463e-05, + "loss": 0.3769, + "step": 4680, + "task_loss": 0.28989529609680176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.21709272265434265, + "epoch": 11.75, + "learning_rate": 2.409951726698849e-05, + "loss": 0.3696, + "step": 4690, + "task_loss": 0.12343645095825195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3037072420120239, + "epoch": 11.78, + "learning_rate": 2.3913850724099516e-05, + "loss": 0.312, + "step": 4700, + "task_loss": 0.2279399037361145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3339011073112488, + "epoch": 11.8, + "learning_rate": 2.3728184181210546e-05, + "loss": 0.3394, + "step": 4710, + "task_loss": 0.1585477590560913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3525348901748657, + "epoch": 11.83, + "learning_rate": 2.3542517638321576e-05, + "loss": 0.3218, + "step": 4720, + "task_loss": 0.2891997694969177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4920140206813812, + "epoch": 11.85, + "learning_rate": 2.3356851095432606e-05, + "loss": 0.2796, + "step": 4730, + "task_loss": 0.32411789894104004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.5961865186691284, + "epoch": 11.88, + "learning_rate": 2.3171184552543633e-05, + "loss": 0.3377, + "step": 4740, + "task_loss": 0.4606587290763855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.09416650235652924, + "epoch": 11.9, + "learning_rate": 2.298551800965466e-05, + "loss": 0.2881, + "step": 4750, + "task_loss": 0.043773770332336426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.436837375164032, + "epoch": 11.93, + "learning_rate": 2.279985146676569e-05, + "loss": 0.3483, + "step": 4760, + "task_loss": 0.3754501938819885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.28128185868263245, + "epoch": 11.95, + "learning_rate": 2.261418492387672e-05, + "loss": 0.3037, + "step": 4770, + "task_loss": 0.16119801998138428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.2050139456987381, + "epoch": 11.98, + "learning_rate": 2.242851838098775e-05, + "loss": 0.2796, + "step": 4780, + "task_loss": 0.2673349976539612 + }, + { + "epoch": 12.0, + "eval_accuracy": 0.9666078258311268, + "eval_loss": 0.1419331282377243, + "eval_runtime": 36.9967, + "eval_samples_per_second": 183.746, + "eval_steps_per_second": 2.892, + "step": 4788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3148767054080963, + "epoch": 12.01, + "learning_rate": 2.2242851838098775e-05, + "loss": 0.3433, + "step": 4790, + "task_loss": 0.2563563585281372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.13746681809425354, + "epoch": 12.03, + "learning_rate": 2.2057185295209805e-05, + "loss": 0.3039, + "step": 4800, + "task_loss": 0.34301137924194336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.22331731021404266, + "epoch": 12.06, + "learning_rate": 2.187151875232083e-05, + "loss": 0.3005, + "step": 4810, + "task_loss": 0.12315309047698975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.42207640409469604, + "epoch": 12.08, + "learning_rate": 2.168585220943186e-05, + "loss": 0.2839, + "step": 4820, + "task_loss": 0.3464387059211731 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.16519689559936523, + "epoch": 12.11, + "learning_rate": 2.150018566654289e-05, + "loss": 0.2698, + "step": 4830, + "task_loss": 0.10176759958267212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.6619377136230469, + "epoch": 12.13, + "learning_rate": 2.131451912365392e-05, + "loss": 0.319, + "step": 4840, + "task_loss": 0.5227985382080078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.09256790578365326, + "epoch": 12.16, + "learning_rate": 2.1128852580764948e-05, + "loss": 0.3268, + "step": 4850, + "task_loss": 0.14811944961547852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.44639626145362854, + "epoch": 12.18, + "learning_rate": 2.0943186037875974e-05, + "loss": 0.3439, + "step": 4860, + "task_loss": 0.3125450611114502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.44268080592155457, + "epoch": 12.21, + "learning_rate": 2.0757519494987004e-05, + "loss": 0.3251, + "step": 4870, + "task_loss": 0.2895811200141907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.1749984622001648, + "epoch": 12.23, + "learning_rate": 2.0571852952098034e-05, + "loss": 0.3112, + "step": 4880, + "task_loss": 0.07510066032409668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3095674514770508, + "epoch": 12.26, + "learning_rate": 2.0386186409209064e-05, + "loss": 0.2881, + "step": 4890, + "task_loss": 0.20059305429458618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.08122143149375916, + "epoch": 12.28, + "learning_rate": 2.020051986632009e-05, + "loss": 0.334, + "step": 4900, + "task_loss": 0.06281787157058716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.27408069372177124, + "epoch": 12.31, + "learning_rate": 2.0014853323431117e-05, + "loss": 0.3541, + "step": 4910, + "task_loss": 0.1118628978729248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.46010467410087585, + "epoch": 12.33, + "learning_rate": 1.9829186780542147e-05, + "loss": 0.3034, + "step": 4920, + "task_loss": 0.23461341857910156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3312876224517822, + "epoch": 12.36, + "learning_rate": 1.9643520237653176e-05, + "loss": 0.3146, + "step": 4930, + "task_loss": 0.21621811389923096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3772110641002655, + "epoch": 12.38, + "learning_rate": 1.9457853694764206e-05, + "loss": 0.3332, + "step": 4940, + "task_loss": 0.257324755191803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.9903969764709473, + "epoch": 12.41, + "learning_rate": 1.9272187151875233e-05, + "loss": 0.3188, + "step": 4950, + "task_loss": 0.6861466765403748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.6129430532455444, + "epoch": 12.43, + "learning_rate": 1.908652060898626e-05, + "loss": 0.2841, + "step": 4960, + "task_loss": 0.6236639022827148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3618336319923401, + "epoch": 12.46, + "learning_rate": 1.890085406609729e-05, + "loss": 0.3077, + "step": 4970, + "task_loss": 0.21517813205718994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3004649579524994, + "epoch": 12.48, + "learning_rate": 1.871518752320832e-05, + "loss": 0.3107, + "step": 4980, + "task_loss": 0.19500714540481567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4918174743652344, + "epoch": 12.51, + "learning_rate": 1.852952098031935e-05, + "loss": 0.3339, + "step": 4990, + "task_loss": 0.514573872089386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.5002132058143616, + "epoch": 12.53, + "learning_rate": 1.834385443743038e-05, + "loss": 0.3665, + "step": 5000, + "task_loss": 0.41887158155441284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.318847119808197, + "epoch": 12.56, + "learning_rate": 1.8158187894541405e-05, + "loss": 0.2721, + "step": 5010, + "task_loss": 0.40893304347991943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.2523074448108673, + "epoch": 12.58, + "learning_rate": 1.7972521351652432e-05, + "loss": 0.2476, + "step": 5020, + "task_loss": 0.16983669996261597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.325198233127594, + "epoch": 12.61, + "learning_rate": 1.778685480876346e-05, + "loss": 0.2931, + "step": 5030, + "task_loss": 0.26100659370422363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.31550291180610657, + "epoch": 12.63, + "learning_rate": 1.760118826587449e-05, + "loss": 0.2993, + "step": 5040, + "task_loss": 0.13907891511917114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.47811147570610046, + "epoch": 12.66, + "learning_rate": 1.741552172298552e-05, + "loss": 0.326, + "step": 5050, + "task_loss": 0.3417906165122986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.13587826490402222, + "epoch": 12.68, + "learning_rate": 1.7229855180096548e-05, + "loss": 0.2857, + "step": 5060, + "task_loss": 0.08225131034851074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.19688141345977783, + "epoch": 12.71, + "learning_rate": 1.7044188637207574e-05, + "loss": 0.3691, + "step": 5070, + "task_loss": 0.10089272260665894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.17473074793815613, + "epoch": 12.73, + "learning_rate": 1.6858522094318604e-05, + "loss": 0.3256, + "step": 5080, + "task_loss": 0.12312960624694824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.27181875705718994, + "epoch": 12.76, + "learning_rate": 1.6672855551429634e-05, + "loss": 0.3167, + "step": 5090, + "task_loss": 0.18777167797088623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.5065706372261047, + "epoch": 12.78, + "learning_rate": 1.6487189008540664e-05, + "loss": 0.3299, + "step": 5100, + "task_loss": 0.25197499990463257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.29144608974456787, + "epoch": 12.81, + "learning_rate": 1.630152246565169e-05, + "loss": 0.2697, + "step": 5110, + "task_loss": 0.3319002389907837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.12119048833847046, + "epoch": 12.83, + "learning_rate": 1.6115855922762717e-05, + "loss": 0.229, + "step": 5120, + "task_loss": 0.1036422848701477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.2655597925186157, + "epoch": 12.86, + "learning_rate": 1.5930189379873747e-05, + "loss": 0.3072, + "step": 5130, + "task_loss": 0.15660184621810913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.19651678204536438, + "epoch": 12.88, + "learning_rate": 1.5744522836984777e-05, + "loss": 0.3015, + "step": 5140, + "task_loss": 0.09583818912506104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.19852705299854279, + "epoch": 12.91, + "learning_rate": 1.5558856294095806e-05, + "loss": 0.3202, + "step": 5150, + "task_loss": 0.1014525294303894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.22350917756557465, + "epoch": 12.93, + "learning_rate": 1.5373189751206833e-05, + "loss": 0.2845, + "step": 5160, + "task_loss": 0.1737072467803955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.20863480865955353, + "epoch": 12.96, + "learning_rate": 1.5187523208317861e-05, + "loss": 0.3023, + "step": 5170, + "task_loss": 0.12738865613937378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.2705041766166687, + "epoch": 12.98, + "learning_rate": 1.500185666542889e-05, + "loss": 0.3157, + "step": 5180, + "task_loss": 0.2589300274848938 + }, + { + "epoch": 13.0, + "eval_accuracy": 0.9692556634304207, + "eval_loss": 0.13274797797203064, + "eval_runtime": 37.0354, + "eval_samples_per_second": 183.554, + "eval_steps_per_second": 2.889, + "step": 5187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.08910688757896423, + "epoch": 13.01, + "learning_rate": 1.481619012253992e-05, + "loss": 0.3307, + "step": 5190, + "task_loss": 0.19602680206298828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.37863457202911377, + "epoch": 13.03, + "learning_rate": 1.4630523579650947e-05, + "loss": 0.2522, + "step": 5200, + "task_loss": 0.18096846342086792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.5536918640136719, + "epoch": 13.06, + "learning_rate": 1.4444857036761977e-05, + "loss": 0.267, + "step": 5210, + "task_loss": 0.41513460874557495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.45684951543807983, + "epoch": 13.08, + "learning_rate": 1.4259190493873004e-05, + "loss": 0.3255, + "step": 5220, + "task_loss": 0.3368619680404663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.7224743366241455, + "epoch": 13.11, + "learning_rate": 1.4073523950984032e-05, + "loss": 0.3025, + "step": 5230, + "task_loss": 0.6985833048820496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.05073893070220947, + "epoch": 13.13, + "learning_rate": 1.3887857408095062e-05, + "loss": 0.2536, + "step": 5240, + "task_loss": 0.03904074430465698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.7553983330726624, + "epoch": 13.16, + "learning_rate": 1.3702190865206092e-05, + "loss": 0.2758, + "step": 5250, + "task_loss": 0.5351722836494446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3640548884868622, + "epoch": 13.18, + "learning_rate": 1.351652432231712e-05, + "loss": 0.2762, + "step": 5260, + "task_loss": 0.2816696763038635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.2268478274345398, + "epoch": 13.21, + "learning_rate": 1.3330857779428146e-05, + "loss": 0.3077, + "step": 5270, + "task_loss": 0.11782294511795044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3061811923980713, + "epoch": 13.23, + "learning_rate": 1.3145191236539176e-05, + "loss": 0.2583, + "step": 5280, + "task_loss": 0.2868692874908447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.14586792886257172, + "epoch": 13.26, + "learning_rate": 1.2959524693650204e-05, + "loss": 0.2976, + "step": 5290, + "task_loss": 0.11300593614578247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.20901861786842346, + "epoch": 13.28, + "learning_rate": 1.2773858150761234e-05, + "loss": 0.2911, + "step": 5300, + "task_loss": 0.17252373695373535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.24131226539611816, + "epoch": 13.31, + "learning_rate": 1.2588191607872262e-05, + "loss": 0.2741, + "step": 5310, + "task_loss": 0.213148832321167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.42875656485557556, + "epoch": 13.33, + "learning_rate": 1.240252506498329e-05, + "loss": 0.3124, + "step": 5320, + "task_loss": 0.594948410987854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4261111617088318, + "epoch": 13.36, + "learning_rate": 1.221685852209432e-05, + "loss": 0.3068, + "step": 5330, + "task_loss": 0.30504292249679565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.06669364869594574, + "epoch": 13.38, + "learning_rate": 1.2031191979205347e-05, + "loss": 0.2443, + "step": 5340, + "task_loss": 0.030248165130615234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4596603512763977, + "epoch": 13.41, + "learning_rate": 1.1845525436316377e-05, + "loss": 0.3056, + "step": 5350, + "task_loss": 0.3883788585662842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3064740300178528, + "epoch": 13.43, + "learning_rate": 1.1659858893427405e-05, + "loss": 0.3333, + "step": 5360, + "task_loss": 0.34499263763427734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.6286535859107971, + "epoch": 13.46, + "learning_rate": 1.1474192350538433e-05, + "loss": 0.2847, + "step": 5370, + "task_loss": 0.43789350986480713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.20650722086429596, + "epoch": 13.48, + "learning_rate": 1.1288525807649463e-05, + "loss": 0.2763, + "step": 5380, + "task_loss": 0.09722369909286499 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3732978105545044, + "epoch": 13.51, + "learning_rate": 1.110285926476049e-05, + "loss": 0.2971, + "step": 5390, + "task_loss": 0.28947287797927856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3377797603607178, + "epoch": 13.53, + "learning_rate": 1.091719272187152e-05, + "loss": 0.3019, + "step": 5400, + "task_loss": 0.23953330516815186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.39829933643341064, + "epoch": 13.56, + "learning_rate": 1.0731526178982548e-05, + "loss": 0.2821, + "step": 5410, + "task_loss": 0.11541533470153809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.32150202989578247, + "epoch": 13.58, + "learning_rate": 1.0545859636093576e-05, + "loss": 0.284, + "step": 5420, + "task_loss": 0.2544916868209839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3417925238609314, + "epoch": 13.61, + "learning_rate": 1.0360193093204606e-05, + "loss": 0.287, + "step": 5430, + "task_loss": 0.3315048813819885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.04803985357284546, + "epoch": 13.63, + "learning_rate": 1.0174526550315634e-05, + "loss": 0.2495, + "step": 5440, + "task_loss": 0.03251296281814575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4633942246437073, + "epoch": 13.66, + "learning_rate": 9.988860007426662e-06, + "loss": 0.2516, + "step": 5450, + "task_loss": 0.2955498695373535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3851782977581024, + "epoch": 13.68, + "learning_rate": 9.803193464537692e-06, + "loss": 0.3122, + "step": 5460, + "task_loss": 0.30496662855148315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3153489828109741, + "epoch": 13.71, + "learning_rate": 9.617526921648718e-06, + "loss": 0.3009, + "step": 5470, + "task_loss": 0.25778961181640625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.05777604132890701, + "epoch": 13.73, + "learning_rate": 9.431860378759748e-06, + "loss": 0.2773, + "step": 5480, + "task_loss": 0.27185744047164917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.06326507776975632, + "epoch": 13.76, + "learning_rate": 9.246193835870776e-06, + "loss": 0.2401, + "step": 5490, + "task_loss": 0.04295825958251953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.405191034078598, + "epoch": 13.78, + "learning_rate": 9.060527292981805e-06, + "loss": 0.2551, + "step": 5500, + "task_loss": 0.3005957007408142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4162152409553528, + "epoch": 13.81, + "learning_rate": 8.874860750092834e-06, + "loss": 0.2403, + "step": 5510, + "task_loss": 0.24527263641357422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.38197383284568787, + "epoch": 13.83, + "learning_rate": 8.689194207203863e-06, + "loss": 0.2401, + "step": 5520, + "task_loss": 0.20181572437286377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.5913758277893066, + "epoch": 13.86, + "learning_rate": 8.50352766431489e-06, + "loss": 0.2646, + "step": 5530, + "task_loss": 0.6561472415924072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.0848148837685585, + "epoch": 13.88, + "learning_rate": 8.317861121425919e-06, + "loss": 0.2796, + "step": 5540, + "task_loss": 0.04874962568283081 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.38164085149765015, + "epoch": 13.91, + "learning_rate": 8.132194578536947e-06, + "loss": 0.2736, + "step": 5550, + "task_loss": 0.23838627338409424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.24937883019447327, + "epoch": 13.93, + "learning_rate": 7.946528035647977e-06, + "loss": 0.2817, + "step": 5560, + "task_loss": 0.21357494592666626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.18074730038642883, + "epoch": 13.96, + "learning_rate": 7.760861492759005e-06, + "loss": 0.2821, + "step": 5570, + "task_loss": 0.21641606092453003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.0911644771695137, + "epoch": 13.98, + "learning_rate": 7.575194949870033e-06, + "loss": 0.2997, + "step": 5580, + "task_loss": 0.04680430889129639 + }, + { + "epoch": 14.0, + "eval_accuracy": 0.9694027655192704, + "eval_loss": 0.1263468861579895, + "eval_runtime": 36.9436, + "eval_samples_per_second": 184.01, + "eval_steps_per_second": 2.896, + "step": 5586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.18836095929145813, + "epoch": 14.01, + "learning_rate": 7.389528406981062e-06, + "loss": 0.2874, + "step": 5590, + "task_loss": 0.0300523042678833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.2339984029531479, + "epoch": 14.04, + "learning_rate": 7.2038618640920914e-06, + "loss": 0.2845, + "step": 5600, + "task_loss": 0.0838119387626648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.31754040718078613, + "epoch": 14.06, + "learning_rate": 7.01819532120312e-06, + "loss": 0.2967, + "step": 5610, + "task_loss": 0.506534218788147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.34850573539733887, + "epoch": 14.09, + "learning_rate": 6.832528778314149e-06, + "loss": 0.3116, + "step": 5620, + "task_loss": 0.28055644035339355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.38626885414123535, + "epoch": 14.11, + "learning_rate": 6.646862235425176e-06, + "loss": 0.3022, + "step": 5630, + "task_loss": 0.1729462742805481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.21166260540485382, + "epoch": 14.14, + "learning_rate": 6.461195692536205e-06, + "loss": 0.2836, + "step": 5640, + "task_loss": 0.1679213047027588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.09462715685367584, + "epoch": 14.16, + "learning_rate": 6.275529149647234e-06, + "loss": 0.2914, + "step": 5650, + "task_loss": 0.038580119609832764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.11994849145412445, + "epoch": 14.19, + "learning_rate": 6.089862606758263e-06, + "loss": 0.3024, + "step": 5660, + "task_loss": 0.06043201684951782 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3605304956436157, + "epoch": 14.21, + "learning_rate": 5.904196063869291e-06, + "loss": 0.3403, + "step": 5670, + "task_loss": 0.28026604652404785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.23137590289115906, + "epoch": 14.24, + "learning_rate": 5.718529520980319e-06, + "loss": 0.2933, + "step": 5680, + "task_loss": 0.15439879894256592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.6911032199859619, + "epoch": 14.26, + "learning_rate": 5.532862978091348e-06, + "loss": 0.2918, + "step": 5690, + "task_loss": 0.5811224579811096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.1986621767282486, + "epoch": 14.29, + "learning_rate": 5.3471964352023775e-06, + "loss": 0.2749, + "step": 5700, + "task_loss": 0.11945277452468872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4699419140815735, + "epoch": 14.31, + "learning_rate": 5.161529892313406e-06, + "loss": 0.287, + "step": 5710, + "task_loss": 0.19817966222763062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.08489257097244263, + "epoch": 14.34, + "learning_rate": 4.975863349424434e-06, + "loss": 0.2829, + "step": 5720, + "task_loss": 0.15260469913482666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4078882932662964, + "epoch": 14.36, + "learning_rate": 4.790196806535462e-06, + "loss": 0.3051, + "step": 5730, + "task_loss": 0.43970197439193726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.3651973009109497, + "epoch": 14.39, + "learning_rate": 4.604530263646491e-06, + "loss": 0.2862, + "step": 5740, + "task_loss": 0.2666727900505066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.07422187924385071, + "epoch": 14.41, + "learning_rate": 4.41886372075752e-06, + "loss": 0.2622, + "step": 5750, + "task_loss": 0.10315924882888794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.17544104158878326, + "epoch": 14.44, + "learning_rate": 4.233197177868548e-06, + "loss": 0.2451, + "step": 5760, + "task_loss": 0.1157999038696289 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.012510798871517181, + "epoch": 14.46, + "learning_rate": 4.047530634979576e-06, + "loss": 0.2755, + "step": 5770, + "task_loss": 0.019283294677734375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4852864444255829, + "epoch": 14.49, + "learning_rate": 3.8618640920906054e-06, + "loss": 0.2649, + "step": 5780, + "task_loss": 0.2382746934890747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.38700103759765625, + "epoch": 14.51, + "learning_rate": 3.676197549201634e-06, + "loss": 0.2688, + "step": 5790, + "task_loss": 0.22405141592025757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.352966845035553, + "epoch": 14.54, + "learning_rate": 3.4905310063126626e-06, + "loss": 0.3237, + "step": 5800, + "task_loss": 0.2017972469329834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.15554654598236084, + "epoch": 14.56, + "learning_rate": 3.304864463423691e-06, + "loss": 0.2696, + "step": 5810, + "task_loss": 0.22360849380493164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.125523641705513, + "epoch": 14.59, + "learning_rate": 3.11919792053472e-06, + "loss": 0.2602, + "step": 5820, + "task_loss": 0.05676722526550293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.27929943799972534, + "epoch": 14.61, + "learning_rate": 2.9335313776457484e-06, + "loss": 0.2747, + "step": 5830, + "task_loss": 0.23685741424560547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.1598871797323227, + "epoch": 14.64, + "learning_rate": 2.747864834756777e-06, + "loss": 0.2353, + "step": 5840, + "task_loss": 0.06815314292907715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.1308247447013855, + "epoch": 14.66, + "learning_rate": 2.5621982918678056e-06, + "loss": 0.2702, + "step": 5850, + "task_loss": 0.2573508620262146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.08836422860622406, + "epoch": 14.69, + "learning_rate": 2.3765317489788342e-06, + "loss": 0.278, + "step": 5860, + "task_loss": 0.26388323307037354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.0640711635351181, + "epoch": 14.71, + "learning_rate": 2.190865206089863e-06, + "loss": 0.3073, + "step": 5870, + "task_loss": 0.04650157690048218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.26123324036598206, + "epoch": 14.74, + "learning_rate": 2.005198663200891e-06, + "loss": 0.3366, + "step": 5880, + "task_loss": 0.18056988716125488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4510636329650879, + "epoch": 14.76, + "learning_rate": 1.81953212031192e-06, + "loss": 0.263, + "step": 5890, + "task_loss": 0.2670378088951111 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.2907771170139313, + "epoch": 14.79, + "learning_rate": 1.6338655774229484e-06, + "loss": 0.2823, + "step": 5900, + "task_loss": 0.1563488245010376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.6017012596130371, + "epoch": 14.81, + "learning_rate": 1.448199034533977e-06, + "loss": 0.2734, + "step": 5910, + "task_loss": 0.3943854570388794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.11146201193332672, + "epoch": 14.84, + "learning_rate": 1.2625324916450056e-06, + "loss": 0.2477, + "step": 5920, + "task_loss": 0.05293494462966919 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.4173949956893921, + "epoch": 14.86, + "learning_rate": 1.0768659487560342e-06, + "loss": 0.2609, + "step": 5930, + "task_loss": 0.31466907262802124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.45049095153808594, + "epoch": 14.89, + "learning_rate": 8.911994058670627e-07, + "loss": 0.2444, + "step": 5940, + "task_loss": 0.3486248254776001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.16688336431980133, + "epoch": 14.91, + "learning_rate": 7.055328629780914e-07, + "loss": 0.2772, + "step": 5950, + "task_loss": 0.20320415496826172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.6928185224533081, + "epoch": 14.94, + "learning_rate": 5.198663200891199e-07, + "loss": 0.276, + "step": 5960, + "task_loss": 0.4054552912712097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.09353074431419373, + "epoch": 14.96, + "learning_rate": 3.341997772001486e-07, + "loss": 0.2461, + "step": 5970, + "task_loss": 0.04078972339630127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.8272228926716351, + "compression/movement_sparsity/model_sparsity": 0.7436530986132632, + "compression_loss": 0.0, + "distillation_loss": 0.06055627763271332, + "epoch": 14.99, + "learning_rate": 1.4853323431117714e-07, + "loss": 0.2667, + "step": 5980, + "task_loss": 0.15467888116836548 + }, + { + "epoch": 15.0, + "eval_accuracy": 0.96954986760812, + "eval_loss": 0.1227947399020195, + "eval_runtime": 36.9368, + "eval_samples_per_second": 184.044, + "eval_steps_per_second": 2.897, + "step": 5985 + }, + { + "epoch": 15.0, + "step": 5985, + "total_flos": 6.974474558739075e+18, + "train_loss": 7.481976795455467, + "train_runtime": 12266.9504, + "train_samples_per_second": 62.478, + "train_steps_per_second": 0.488 + } + ], + "max_steps": 5985, + "num_train_epochs": 15, + "total_flos": 6.974474558739075e+18, + "trial_name": null, + "trial_params": null +}