diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6347 @@ +{ + "best_metric": 0.9794057075610474, + "best_model_checkpoint": "/nvme2/yujiepan/workspace/jpqd-test/playground/optimum-playground/0314.example-rerun/logs/w2v2-ks-jpqd-quant-FE-finetuned-student/checkpoint-4788", + "epoch": 11.999373825923607, + "global_step": 4788, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.51446533203125, + "epoch": 0.03, + "learning_rate": 8.771929824561403e-08, + "loss": 0.9515, + "step": 10, + "task_loss": 0.9478130340576172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.1920084953308105, + "epoch": 0.05, + "learning_rate": 3.2163742690058475e-07, + "loss": 0.9499, + "step": 20, + "task_loss": 1.1256299018859863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.5554466247558594, + "epoch": 0.08, + "learning_rate": 6.140350877192981e-07, + "loss": 0.8458, + "step": 30, + "task_loss": 0.8432578444480896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1300909519195557, + "epoch": 0.1, + "learning_rate": 8.771929824561403e-07, + "loss": 0.7546, + "step": 40, + "task_loss": 0.6580129861831665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2838783264160156, + "epoch": 0.13, + "learning_rate": 1.1695906432748535e-06, + "loss": 0.7662, + "step": 50, + "task_loss": 0.6450424194335938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9236965775489807, + "epoch": 0.15, + "learning_rate": 1.4619883040935671e-06, + "loss": 0.6895, + "step": 60, + "task_loss": 0.4147263765335083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3553576469421387, + "epoch": 0.18, + "learning_rate": 1.7543859649122805e-06, + "loss": 0.5607, + "step": 70, + "task_loss": 0.5525619983673096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.6165952682495117, + "epoch": 0.2, + "learning_rate": 2.046783625730994e-06, + "loss": 0.6068, + "step": 80, + "task_loss": 0.8984407782554626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.6254358291625977, + "epoch": 0.23, + "learning_rate": 2.339181286549707e-06, + "loss": 0.5856, + "step": 90, + "task_loss": 0.8998498916625977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6450754404067993, + "epoch": 0.25, + "learning_rate": 2.6315789473684207e-06, + "loss": 0.6321, + "step": 100, + "task_loss": 0.2010144591331482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7683451175689697, + "epoch": 0.28, + "learning_rate": 2.9239766081871343e-06, + "loss": 0.5395, + "step": 110, + "task_loss": 0.3388332724571228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49454671144485474, + "epoch": 0.3, + "learning_rate": 3.2163742690058475e-06, + "loss": 0.5699, + "step": 120, + "task_loss": 0.35037726163864136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6786686182022095, + "epoch": 0.33, + "learning_rate": 3.508771929824561e-06, + "loss": 0.559, + "step": 130, + "task_loss": 0.510002076625824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5671663284301758, + "epoch": 0.35, + "learning_rate": 3.8011695906432742e-06, + "loss": 0.5609, + "step": 140, + "task_loss": 0.29252851009368896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5783287882804871, + "epoch": 0.38, + "learning_rate": 4.093567251461988e-06, + "loss": 0.4257, + "step": 150, + "task_loss": 0.32213056087493896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8722546100616455, + "epoch": 0.4, + "learning_rate": 4.3859649122807014e-06, + "loss": 0.4616, + "step": 160, + "task_loss": 0.5493186116218567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5747439861297607, + "epoch": 0.43, + "learning_rate": 4.678362573099414e-06, + "loss": 0.459, + "step": 170, + "task_loss": 0.26550352573394775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.16851046681404114, + "epoch": 0.45, + "learning_rate": 4.970760233918128e-06, + "loss": 0.4353, + "step": 180, + "task_loss": 0.08923470973968506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2978078126907349, + "epoch": 0.48, + "learning_rate": 5.263157894736841e-06, + "loss": 0.4977, + "step": 190, + "task_loss": 0.817858099937439 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8489176034927368, + "epoch": 0.5, + "learning_rate": 5.555555555555555e-06, + "loss": 0.5026, + "step": 200, + "task_loss": 0.4384632706642151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7206552028656006, + "epoch": 0.53, + "learning_rate": 5.8479532163742686e-06, + "loss": 0.4572, + "step": 210, + "task_loss": 0.3212318420410156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8086921572685242, + "epoch": 0.55, + "learning_rate": 6.140350877192981e-06, + "loss": 0.4925, + "step": 220, + "task_loss": 0.447459876537323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8657560348510742, + "epoch": 0.58, + "learning_rate": 6.432748538011695e-06, + "loss": 0.4788, + "step": 230, + "task_loss": 0.5898361206054688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5544594526290894, + "epoch": 0.6, + "learning_rate": 6.7251461988304085e-06, + "loss": 0.442, + "step": 240, + "task_loss": 0.34924864768981934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7391929626464844, + "epoch": 0.63, + "learning_rate": 7.017543859649122e-06, + "loss": 0.4101, + "step": 250, + "task_loss": 0.40177232027053833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.812840461730957, + "epoch": 0.65, + "learning_rate": 7.309941520467835e-06, + "loss": 0.4813, + "step": 260, + "task_loss": 0.5525591373443604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3050377368927002, + "epoch": 0.68, + "learning_rate": 7.6023391812865485e-06, + "loss": 0.4787, + "step": 270, + "task_loss": 0.9039720892906189 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.47748178243637085, + "epoch": 0.7, + "learning_rate": 7.894736842105261e-06, + "loss": 0.412, + "step": 280, + "task_loss": 0.18844187259674072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7705972194671631, + "epoch": 0.73, + "learning_rate": 8.187134502923976e-06, + "loss": 0.4226, + "step": 290, + "task_loss": 0.4612080454826355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4262794852256775, + "epoch": 0.75, + "learning_rate": 8.479532163742688e-06, + "loss": 0.4198, + "step": 300, + "task_loss": 0.3151981234550476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5648545026779175, + "epoch": 0.78, + "learning_rate": 8.771929824561403e-06, + "loss": 0.3941, + "step": 310, + "task_loss": 0.4206511378288269 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.44185498356819153, + "epoch": 0.8, + "learning_rate": 9.064327485380116e-06, + "loss": 0.4256, + "step": 320, + "task_loss": 0.24285316467285156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5537223815917969, + "epoch": 0.83, + "learning_rate": 9.356725146198828e-06, + "loss": 0.4058, + "step": 330, + "task_loss": 0.21424394845962524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45310112833976746, + "epoch": 0.85, + "learning_rate": 9.649122807017543e-06, + "loss": 0.3847, + "step": 340, + "task_loss": 0.16723406314849854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8928914070129395, + "epoch": 0.88, + "learning_rate": 9.941520467836256e-06, + "loss": 0.353, + "step": 350, + "task_loss": 0.5004438161849976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.086942434310913, + "epoch": 0.9, + "learning_rate": 1.023391812865497e-05, + "loss": 0.3981, + "step": 360, + "task_loss": 0.4962713122367859 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.283523827791214, + "epoch": 0.93, + "learning_rate": 1.0526315789473683e-05, + "loss": 0.3131, + "step": 370, + "task_loss": 0.3023862838745117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.72919762134552, + "epoch": 0.95, + "learning_rate": 1.0818713450292396e-05, + "loss": 0.3519, + "step": 380, + "task_loss": 0.3908747434616089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3192288875579834, + "epoch": 0.98, + "learning_rate": 1.111111111111111e-05, + "loss": 0.3477, + "step": 390, + "task_loss": 0.28754663467407227 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.9636657840541336, + "eval_loss": 0.15156690776348114, + "eval_runtime": 116.1773, + "eval_samples_per_second": 58.514, + "eval_steps_per_second": 1.833, + "step": 399 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -0.0003772132040467113, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010102966915085817, + "compression/movement_sparsity/model_sparsity": 0.0009082319164113224, + "compression_loss": 0.0, + "distillation_loss": 0.35058850049972534, + "epoch": 1.0, + "learning_rate": 1.1403508771929823e-05, + "loss": 0.3485, + "step": 400, + "task_loss": 0.1688241958618164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0005970049999999993, + "compression/movement_sparsity/importance_threshold": -0.00037158324982466365, + "compression/movement_sparsity/linear_layer_sparsity": 0.0012727599932249322, + "compression/movement_sparsity/model_sparsity": 0.0011441799794991438, + "compression_loss": 0.16240963339805603, + "distillation_loss": 0.4725266396999359, + "epoch": 1.03, + "learning_rate": 1.1695906432748537e-05, + "loss": 0.433, + "step": 410, + "task_loss": 0.288402259349823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0011880399999999991, + "compression/movement_sparsity/importance_threshold": -0.00036600959467331996, + "compression/movement_sparsity/linear_layer_sparsity": 0.0019467616117133392, + "compression/movement_sparsity/model_sparsity": 0.001750090883463397, + "compression_loss": 0.32319432497024536, + "distillation_loss": 0.3679881989955902, + "epoch": 1.05, + "learning_rate": 1.198830409356725e-05, + "loss": 0.5607, + "step": 420, + "task_loss": 0.1445859670639038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0017731350000000036, + "compression/movement_sparsity/importance_threshold": -0.00036049195568277717, + "compression/movement_sparsity/linear_layer_sparsity": 0.0026285851400180667, + "compression/movement_sparsity/model_sparsity": 0.0023630334922745354, + "compression_loss": 0.4823610484600067, + "distillation_loss": 0.6154017448425293, + "epoch": 1.08, + "learning_rate": 1.2280701754385963e-05, + "loss": 0.7495, + "step": 430, + "task_loss": 0.3742516040802002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0023523200000000032, + "compression/movement_sparsity/importance_threshold": -0.0003550300499431323, + "compression/movement_sparsity/linear_layer_sparsity": 0.0037378260501355013, + "compression/movement_sparsity/model_sparsity": 0.003360213831500896, + "compression_loss": 0.6399164199829102, + "distillation_loss": 0.7374873757362366, + "epoch": 1.1, + "learning_rate": 1.2573099415204677e-05, + "loss": 0.9126, + "step": 440, + "task_loss": 0.37421369552612305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.002925625000000003, + "compression/movement_sparsity/importance_threshold": -0.0003496235945444823, + "compression/movement_sparsity/linear_layer_sparsity": 0.00737426132941885, + "compression/movement_sparsity/model_sparsity": 0.0066292798497985545, + "compression_loss": 0.7958664298057556, + "distillation_loss": 0.6916153430938721, + "epoch": 1.13, + "learning_rate": 1.286549707602339e-05, + "loss": 1.1032, + "step": 450, + "task_loss": 0.35443025827407837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0034930800000000017, + "compression/movement_sparsity/importance_threshold": -0.0003442723065769242, + "compression/movement_sparsity/linear_layer_sparsity": 0.014844843891147244, + "compression/movement_sparsity/model_sparsity": 0.013345150122140185, + "compression_loss": 0.9502197504043579, + "distillation_loss": 0.25516602396965027, + "epoch": 1.15, + "learning_rate": 1.3157894736842103e-05, + "loss": 1.2696, + "step": 460, + "task_loss": 0.12397211790084839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.004054715000000005, + "compression/movement_sparsity/importance_threshold": -0.00033897590313055476, + "compression/movement_sparsity/linear_layer_sparsity": 0.025417325353809094, + "compression/movement_sparsity/model_sparsity": 0.0228495513349348, + "compression_loss": 1.102981686592102, + "distillation_loss": 0.7275341749191284, + "epoch": 1.18, + "learning_rate": 1.3450292397660817e-05, + "loss": 1.4399, + "step": 470, + "task_loss": 0.28879886865615845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.004610560000000006, + "compression/movement_sparsity/importance_threshold": -0.0003337341012954711, + "compression/movement_sparsity/linear_layer_sparsity": 0.03841998597937368, + "compression/movement_sparsity/model_sparsity": 0.03453862393871483, + "compression_loss": 1.2541608810424805, + "distillation_loss": 0.3953809142112732, + "epoch": 1.2, + "learning_rate": 1.374269005847953e-05, + "loss": 1.5474, + "step": 480, + "task_loss": 0.30661720037460327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.005160645000000006, + "compression/movement_sparsity/importance_threshold": -0.00032854661816177023, + "compression/movement_sparsity/linear_layer_sparsity": 0.05003314607798856, + "compression/movement_sparsity/model_sparsity": 0.04497856968990499, + "compression_loss": 1.403770089149475, + "distillation_loss": 0.14313216507434845, + "epoch": 1.23, + "learning_rate": 1.4035087719298244e-05, + "loss": 1.7254, + "step": 490, + "task_loss": 0.15733814239501953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.005705000000000004, + "compression/movement_sparsity/importance_threshold": -0.0003234131708195491, + "compression/movement_sparsity/linear_layer_sparsity": 0.060780132584311954, + "compression/movement_sparsity/model_sparsity": 0.05463984665173478, + "compression_loss": 1.5518090724945068, + "distillation_loss": 0.3677009344100952, + "epoch": 1.25, + "learning_rate": 1.4327485380116957e-05, + "loss": 1.8058, + "step": 500, + "task_loss": 0.3658151626586914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.006243655000000006, + "compression/movement_sparsity/importance_threshold": -0.00031833347635890454, + "compression/movement_sparsity/linear_layer_sparsity": 0.07141967075052695, + "compression/movement_sparsity/model_sparsity": 0.06420453019435209, + "compression_loss": 1.698294758796692, + "distillation_loss": 0.1806531548500061, + "epoch": 1.28, + "learning_rate": 1.461988304093567e-05, + "loss": 2.0346, + "step": 510, + "task_loss": 0.17051905393600464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.006776640000000005, + "compression/movement_sparsity/importance_threshold": -0.00031330725186993363, + "compression/movement_sparsity/linear_layer_sparsity": 0.08092245605615779, + "compression/movement_sparsity/model_sparsity": 0.07274730082986806, + "compression_loss": 1.8432351350784302, + "distillation_loss": 0.8070929050445557, + "epoch": 1.3, + "learning_rate": 1.4912280701754384e-05, + "loss": 2.1974, + "step": 520, + "task_loss": 0.42286455631256104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.007303984999999993, + "compression/movement_sparsity/importance_threshold": -0.00030833421444273343, + "compression/movement_sparsity/linear_layer_sparsity": 0.09169531955736224, + "compression/movement_sparsity/model_sparsity": 0.08243184057464996, + "compression_loss": 1.9866251945495605, + "distillation_loss": 0.5600777864456177, + "epoch": 1.33, + "learning_rate": 1.5204678362573097e-05, + "loss": 2.2739, + "step": 530, + "task_loss": 0.1994045376777649 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.007825719999999996, + "compression/movement_sparsity/importance_threshold": -0.0003034140811674006, + "compression/movement_sparsity/linear_layer_sparsity": 0.102509927356218, + "compression/movement_sparsity/model_sparsity": 0.09215390741793049, + "compression_loss": 2.1284844875335693, + "distillation_loss": 0.6648948192596436, + "epoch": 1.35, + "learning_rate": 1.549707602339181e-05, + "loss": 2.4111, + "step": 540, + "task_loss": 0.397721529006958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.008341874999999997, + "compression/movement_sparsity/importance_threshold": -0.00029854656913403234, + "compression/movement_sparsity/linear_layer_sparsity": 0.11246536011367059, + "compression/movement_sparsity/model_sparsity": 0.10110359699723998, + "compression_loss": 2.268819808959961, + "distillation_loss": 0.5570085644721985, + "epoch": 1.38, + "learning_rate": 1.5789473684210522e-05, + "loss": 2.5594, + "step": 550, + "task_loss": 0.24902677536010742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.008852479999999998, + "compression/movement_sparsity/importance_threshold": -0.00029373139543272556, + "compression/movement_sparsity/linear_layer_sparsity": 0.12301906899277326, + "compression/movement_sparsity/model_sparsity": 0.11059112211840207, + "compression_loss": 2.4076294898986816, + "distillation_loss": 1.1394556760787964, + "epoch": 1.4, + "learning_rate": 1.608187134502924e-05, + "loss": 2.661, + "step": 560, + "task_loss": 0.47712135314941406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.009357564999999997, + "compression/movement_sparsity/importance_threshold": -0.00028896827715357723, + "compression/movement_sparsity/linear_layer_sparsity": 0.1357961175097862, + "compression/movement_sparsity/model_sparsity": 0.12207737497681652, + "compression_loss": 2.54492449760437, + "distillation_loss": 0.7886945009231567, + "epoch": 1.43, + "learning_rate": 1.637426900584795e-05, + "loss": 2.8728, + "step": 570, + "task_loss": 0.5190638899803162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.009857159999999997, + "compression/movement_sparsity/importance_threshold": -0.00028425693138668434, + "compression/movement_sparsity/linear_layer_sparsity": 0.1511111816847335, + "compression/movement_sparsity/model_sparsity": 0.13584524158717315, + "compression_loss": 2.6807024478912354, + "distillation_loss": 0.5934693813323975, + "epoch": 1.45, + "learning_rate": 1.6666666666666664e-05, + "loss": 3.0271, + "step": 580, + "task_loss": 0.43770378828048706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.010351294999999996, + "compression/movement_sparsity/importance_threshold": -0.0002795970752221438, + "compression/movement_sparsity/linear_layer_sparsity": 0.17006478658536586, + "compression/movement_sparsity/model_sparsity": 0.15288406696044038, + "compression_loss": 2.814985752105713, + "distillation_loss": 0.4786364436149597, + "epoch": 1.48, + "learning_rate": 1.6959064327485377e-05, + "loss": 3.2229, + "step": 590, + "task_loss": 0.19185221195220947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.010839999999999997, + "compression/movement_sparsity/importance_threshold": -0.0002749884257500526, + "compression/movement_sparsity/linear_layer_sparsity": 0.18809244203553147, + "compression/movement_sparsity/model_sparsity": 0.16909048651572786, + "compression_loss": 2.9477694034576416, + "distillation_loss": 0.6082563400268555, + "epoch": 1.5, + "learning_rate": 1.725146198830409e-05, + "loss": 3.4037, + "step": 600, + "task_loss": 0.20349204540252686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.011323304999999997, + "compression/movement_sparsity/importance_threshold": -0.0002704307000605077, + "compression/movement_sparsity/linear_layer_sparsity": 0.20836323302469137, + "compression/movement_sparsity/model_sparsity": 0.18731342983722607, + "compression_loss": 3.0790719985961914, + "distillation_loss": 1.0272729396820068, + "epoch": 1.53, + "learning_rate": 1.7543859649122806e-05, + "loss": 3.4945, + "step": 610, + "task_loss": 0.4551582932472229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.011801239999999998, + "compression/movement_sparsity/importance_threshold": -0.00026592361524360606, + "compression/movement_sparsity/linear_layer_sparsity": 0.22696513427807888, + "compression/movement_sparsity/model_sparsity": 0.20403608226819747, + "compression_loss": 3.2089145183563232, + "distillation_loss": 1.0786527395248413, + "epoch": 1.55, + "learning_rate": 1.783625730994152e-05, + "loss": 3.7627, + "step": 620, + "task_loss": 0.6676000356674194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.012273834999999997, + "compression/movement_sparsity/importance_threshold": -0.00026146688838944466, + "compression/movement_sparsity/linear_layer_sparsity": 0.24483187010689553, + "compression/movement_sparsity/model_sparsity": 0.22009783903549965, + "compression_loss": 3.337289333343506, + "distillation_loss": 0.9446437358856201, + "epoch": 1.58, + "learning_rate": 1.812865497076023e-05, + "loss": 3.8192, + "step": 630, + "task_loss": 0.5477078557014465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.012741120000000002, + "compression/movement_sparsity/importance_threshold": -0.00025706023658812044, + "compression/movement_sparsity/linear_layer_sparsity": 0.26339174476814214, + "compression/movement_sparsity/model_sparsity": 0.23678271059215836, + "compression_loss": 3.4642162322998047, + "distillation_loss": 0.8719321489334106, + "epoch": 1.6, + "learning_rate": 1.8421052631578944e-05, + "loss": 3.9186, + "step": 640, + "task_loss": 0.6353597044944763 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.013203125, + "compression/movement_sparsity/importance_threshold": -0.00025270337692973044, + "compression/movement_sparsity/linear_layer_sparsity": 0.2798236835666968, + "compression/movement_sparsity/model_sparsity": 0.25155462006271223, + "compression_loss": 3.58969783782959, + "distillation_loss": 1.3224390745162964, + "epoch": 1.63, + "learning_rate": 1.8713450292397657e-05, + "loss": 3.9562, + "step": 650, + "task_loss": 0.880706250667572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.013659880000000003, + "compression/movement_sparsity/importance_threshold": -0.0002483960265043715, + "compression/movement_sparsity/linear_layer_sparsity": 0.29560894450090336, + "compression/movement_sparsity/model_sparsity": 0.26574518201330083, + "compression_loss": 3.713754892349243, + "distillation_loss": 0.628543496131897, + "epoch": 1.65, + "learning_rate": 1.9005847953216373e-05, + "loss": 4.1618, + "step": 660, + "task_loss": 0.21685028076171875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.014111415000000002, + "compression/movement_sparsity/importance_threshold": -0.00024413790240214073, + "compression/movement_sparsity/linear_layer_sparsity": 0.3098370102190605, + "compression/movement_sparsity/model_sparsity": 0.27853586370377764, + "compression_loss": 3.836373805999756, + "distillation_loss": 0.18178164958953857, + "epoch": 1.68, + "learning_rate": 1.9298245614035086e-05, + "loss": 4.2838, + "step": 670, + "task_loss": 0.11985671520233154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.014557760000000003, + "compression/movement_sparsity/importance_threshold": -0.000239928721713135, + "compression/movement_sparsity/linear_layer_sparsity": 0.32408217827838, + "compression/movement_sparsity/model_sparsity": 0.29134191997898734, + "compression_loss": 3.957580804824829, + "distillation_loss": 0.7838168144226074, + "epoch": 1.7, + "learning_rate": 1.95906432748538e-05, + "loss": 4.3676, + "step": 680, + "task_loss": 0.4651721715927124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.014998945000000003, + "compression/movement_sparsity/importance_threshold": -0.00023576820152745127, + "compression/movement_sparsity/linear_layer_sparsity": 0.33866125366982835, + "compression/movement_sparsity/model_sparsity": 0.3044481507462172, + "compression_loss": 4.077360153198242, + "distillation_loss": 0.8586763143539429, + "epoch": 1.73, + "learning_rate": 1.988304093567251e-05, + "loss": 4.5084, + "step": 690, + "task_loss": 0.6287266612052917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.015435000000000004, + "compression/movement_sparsity/importance_threshold": -0.00023165605893518657, + "compression/movement_sparsity/linear_layer_sparsity": 0.3526366422952424, + "compression/movement_sparsity/model_sparsity": 0.317011681935159, + "compression_loss": 4.195735931396484, + "distillation_loss": 2.807767391204834, + "epoch": 1.75, + "learning_rate": 2.0175438596491224e-05, + "loss": 4.605, + "step": 700, + "task_loss": 1.069559097290039 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.015865955, + "compression/movement_sparsity/importance_threshold": -0.00022759201102643783, + "compression/movement_sparsity/linear_layer_sparsity": 0.36448323641222524, + "compression/movement_sparsity/model_sparsity": 0.32766147913656163, + "compression_loss": 4.312725067138672, + "distillation_loss": 0.7329788208007812, + "epoch": 1.78, + "learning_rate": 2.046783625730994e-05, + "loss": 4.7339, + "step": 710, + "task_loss": 0.3494850993156433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.016291840000000005, + "compression/movement_sparsity/importance_threshold": -0.00022357577489130195, + "compression/movement_sparsity/linear_layer_sparsity": 0.3769337760651912, + "compression/movement_sparsity/model_sparsity": 0.3388542085440818, + "compression_loss": 4.428336143493652, + "distillation_loss": 0.37443456053733826, + "epoch": 1.8, + "learning_rate": 2.0760233918128653e-05, + "loss": 4.8305, + "step": 720, + "task_loss": 0.24049115180969238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.016712685000000005, + "compression/movement_sparsity/importance_threshold": -0.00021960706761987598, + "compression/movement_sparsity/linear_layer_sparsity": 0.3912986134635652, + "compression/movement_sparsity/model_sparsity": 0.3517678446164528, + "compression_loss": 4.542544841766357, + "distillation_loss": 0.6300625801086426, + "epoch": 1.83, + "learning_rate": 2.1052631578947366e-05, + "loss": 4.9322, + "step": 730, + "task_loss": 0.3629845976829529 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.017128519999999994, + "compression/movement_sparsity/importance_threshold": -0.00021568560630225698, + "compression/movement_sparsity/linear_layer_sparsity": 0.40521868413128576, + "compression/movement_sparsity/model_sparsity": 0.36428164631983834, + "compression_loss": 4.655385971069336, + "distillation_loss": 0.6407253742218018, + "epoch": 1.85, + "learning_rate": 2.1345029239766078e-05, + "loss": 5.0662, + "step": 740, + "task_loss": 0.2929871678352356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.017539375000000003, + "compression/movement_sparsity/importance_threshold": -0.00021181110802854163, + "compression/movement_sparsity/linear_layer_sparsity": 0.42049019261894005, + "compression/movement_sparsity/model_sparsity": 0.37801035743688965, + "compression_loss": 4.766801357269287, + "distillation_loss": 0.8653441667556763, + "epoch": 1.88, + "learning_rate": 2.163742690058479e-05, + "loss": 5.1373, + "step": 750, + "task_loss": 0.5622042417526245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.017945279999999997, + "compression/movement_sparsity/importance_threshold": -0.00020798328988882715, + "compression/movement_sparsity/linear_layer_sparsity": 0.4363304068804577, + "compression/movement_sparsity/model_sparsity": 0.3922503210792747, + "compression_loss": 4.87684965133667, + "distillation_loss": 0.9700570702552795, + "epoch": 1.9, + "learning_rate": 2.1929824561403507e-05, + "loss": 5.3363, + "step": 760, + "task_loss": 0.39909374713897705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.018346265000000007, + "compression/movement_sparsity/importance_threshold": -0.0002042018689732103, + "compression/movement_sparsity/linear_layer_sparsity": 0.452758193597561, + "compression/movement_sparsity/model_sparsity": 0.407018497930564, + "compression_loss": 4.985548973083496, + "distillation_loss": 0.6564913988113403, + "epoch": 1.93, + "learning_rate": 2.222222222222222e-05, + "loss": 5.4895, + "step": 770, + "task_loss": 0.320431113243103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01874236, + "compression/movement_sparsity/importance_threshold": -0.00020046656237178833, + "compression/movement_sparsity/linear_layer_sparsity": 0.46551143527928335, + "compression/movement_sparsity/model_sparsity": 0.4184833490286625, + "compression_loss": 5.092935085296631, + "distillation_loss": 0.4097839295864105, + "epoch": 1.95, + "learning_rate": 2.2514619883040933e-05, + "loss": 5.6222, + "step": 780, + "task_loss": 0.12948280572891235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.019133595000000003, + "compression/movement_sparsity/importance_threshold": -0.0001967770871746579, + "compression/movement_sparsity/linear_layer_sparsity": 0.47700519657106294, + "compression/movement_sparsity/model_sparsity": 0.4288159581845132, + "compression_loss": 5.199016571044922, + "distillation_loss": 0.44741755723953247, + "epoch": 1.98, + "learning_rate": 2.2807017543859645e-05, + "loss": 5.5957, + "step": 790, + "task_loss": 0.12202805280685425 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.9545454545454546, + "eval_loss": 5.479804515838623, + "eval_runtime": 86.4849, + "eval_samples_per_second": 78.603, + "eval_steps_per_second": 2.463, + "step": 798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.019558376005000003, + "compression/movement_sparsity/importance_threshold": -0.0001927712620768021, + "compression/movement_sparsity/linear_layer_sparsity": 0.48800660098614873, + "compression/movement_sparsity/model_sparsity": 0.4387059505987312, + "compression_loss": 5.314239025115967, + "distillation_loss": 0.7526724934577942, + "epoch": 2.01, + "learning_rate": 2.3099415204678358e-05, + "loss": 5.8631, + "step": 800, + "task_loss": 0.40847355127334595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.019939502655, + "compression/movement_sparsity/importance_threshold": -0.00018917711195694988, + "compression/movement_sparsity/linear_layer_sparsity": 0.49847956187895215, + "compression/movement_sparsity/model_sparsity": 0.44812088526309046, + "compression_loss": 5.417582988739014, + "distillation_loss": 1.0302109718322754, + "epoch": 2.03, + "learning_rate": 2.3391812865497074e-05, + "loss": 5.8711, + "step": 810, + "task_loss": 0.4869253635406494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.020315862305000005, + "compression/movement_sparsity/importance_threshold": -0.0001856279162206899, + "compression/movement_sparsity/linear_layer_sparsity": 0.5098980799269798, + "compression/movement_sparsity/model_sparsity": 0.4583858525905118, + "compression_loss": 5.519580364227295, + "distillation_loss": 1.0941754579544067, + "epoch": 2.06, + "learning_rate": 2.3684210526315787e-05, + "loss": 5.9025, + "step": 820, + "task_loss": 0.6853399276733398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.020687484955000005, + "compression/movement_sparsity/importance_threshold": -0.00018212339195811914, + "compression/movement_sparsity/linear_layer_sparsity": 0.5205379474367661, + "compression/movement_sparsity/model_sparsity": 0.4679508322049121, + "compression_loss": 5.620279312133789, + "distillation_loss": 0.39905670285224915, + "epoch": 2.08, + "learning_rate": 2.39766081871345e-05, + "loss": 6.0413, + "step": 830, + "task_loss": 0.3336109519004822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.021054400605000002, + "compression/movement_sparsity/importance_threshold": -0.0001786632562593346, + "compression/movement_sparsity/linear_layer_sparsity": 0.5318343025444143, + "compression/movement_sparsity/model_sparsity": 0.47810597804881505, + "compression_loss": 5.719671726226807, + "distillation_loss": 1.3409157991409302, + "epoch": 2.11, + "learning_rate": 2.4269005847953213e-05, + "loss": 6.1711, + "step": 840, + "task_loss": 0.5142614841461182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.021416639255000005, + "compression/movement_sparsity/importance_threshold": -0.0001752472262144332, + "compression/movement_sparsity/linear_layer_sparsity": 0.5451425822417947, + "compression/movement_sparsity/model_sparsity": 0.4900697946932517, + "compression_loss": 5.817708492279053, + "distillation_loss": 1.0011571645736694, + "epoch": 2.13, + "learning_rate": 2.4561403508771925e-05, + "loss": 6.2293, + "step": 850, + "task_loss": 0.8053411841392517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.021774230905000004, + "compression/movement_sparsity/importance_threshold": -0.00017187501891351197, + "compression/movement_sparsity/linear_layer_sparsity": 0.5574857441282746, + "compression/movement_sparsity/model_sparsity": 0.5011659941255129, + "compression_loss": 5.914428234100342, + "distillation_loss": 0.577518105506897, + "epoch": 2.16, + "learning_rate": 2.4853801169590638e-05, + "loss": 6.3091, + "step": 860, + "task_loss": 0.6429996490478516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.022127205555, + "compression/movement_sparsity/importance_threshold": -0.00016854635144666782, + "compression/movement_sparsity/linear_layer_sparsity": 0.5681574050549534, + "compression/movement_sparsity/model_sparsity": 0.5107595552409675, + "compression_loss": 6.009877681732178, + "distillation_loss": 0.9484221935272217, + "epoch": 2.18, + "learning_rate": 2.5146198830409354e-05, + "loss": 6.4402, + "step": 870, + "task_loss": 0.5138943195343018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.022475593205000005, + "compression/movement_sparsity/importance_threshold": -0.00016526094090399767, + "compression/movement_sparsity/linear_layer_sparsity": 0.5798891288392051, + "compression/movement_sparsity/model_sparsity": 0.5213060868340473, + "compression_loss": 6.104116439819336, + "distillation_loss": 0.4834554195404053, + "epoch": 2.21, + "learning_rate": 2.5438596491228067e-05, + "loss": 6.497, + "step": 880, + "task_loss": 0.3683658838272095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.022819423855000003, + "compression/movement_sparsity/importance_threshold": -0.00016201850437559863, + "compression/movement_sparsity/linear_layer_sparsity": 0.5897178466576333, + "compression/movement_sparsity/model_sparsity": 0.5301418627948372, + "compression_loss": 6.197108745574951, + "distillation_loss": 0.4105452597141266, + "epoch": 2.23, + "learning_rate": 2.573099415204678e-05, + "loss": 6.601, + "step": 890, + "task_loss": 0.09200382232666016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.023158727504999997, + "compression/movement_sparsity/importance_threshold": -0.00015881875895156758, + "compression/movement_sparsity/linear_layer_sparsity": 0.5997669541365552, + "compression/movement_sparsity/model_sparsity": 0.5391757636484333, + "compression_loss": 6.288847923278809, + "distillation_loss": 0.6942120790481567, + "epoch": 2.26, + "learning_rate": 2.6023391812865492e-05, + "loss": 6.6452, + "step": 900, + "task_loss": 0.3441739082336426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.023493534155000006, + "compression/movement_sparsity/importance_threshold": -0.00015566142172200137, + "compression/movement_sparsity/linear_layer_sparsity": 0.6099394948810599, + "compression/movement_sparsity/model_sparsity": 0.548320627976711, + "compression_loss": 6.3793840408325195, + "distillation_loss": 0.8501518964767456, + "epoch": 2.28, + "learning_rate": 2.6315789473684205e-05, + "loss": 6.7787, + "step": 910, + "task_loss": 0.4232085049152374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.023823873805, + "compression/movement_sparsity/importance_threshold": -0.0001525462097769972, + "compression/movement_sparsity/linear_layer_sparsity": 0.6198110626505571, + "compression/movement_sparsity/model_sparsity": 0.5571949249912712, + "compression_loss": 6.468705177307129, + "distillation_loss": 0.4476397633552551, + "epoch": 2.31, + "learning_rate": 2.660818713450292e-05, + "loss": 6.8706, + "step": 920, + "task_loss": 0.26603904366493225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.024149776455000004, + "compression/movement_sparsity/importance_threshold": -0.00014947284020665178, + "compression/movement_sparsity/linear_layer_sparsity": 0.6292107046070461, + "compression/movement_sparsity/model_sparsity": 0.5656449722887386, + "compression_loss": 6.55678129196167, + "distillation_loss": 0.8385058641433716, + "epoch": 2.33, + "learning_rate": 2.6900584795321634e-05, + "loss": 7.0258, + "step": 930, + "task_loss": 0.5302727818489075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.024471272104999997, + "compression/movement_sparsity/importance_threshold": -0.00014644103010106235, + "compression/movement_sparsity/linear_layer_sparsity": 0.6367144802958447, + "compression/movement_sparsity/model_sparsity": 0.5723906823672124, + "compression_loss": 6.6436238288879395, + "distillation_loss": 0.4353128671646118, + "epoch": 2.36, + "learning_rate": 2.7192982456140347e-05, + "loss": 7.0157, + "step": 940, + "task_loss": 0.17246079444885254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.024788390755, + "compression/movement_sparsity/importance_threshold": -0.00014345049655032564, + "compression/movement_sparsity/linear_layer_sparsity": 0.6451845264980428, + "compression/movement_sparsity/model_sparsity": 0.5800050459091022, + "compression_loss": 6.729299545288086, + "distillation_loss": 0.4371870756149292, + "epoch": 2.38, + "learning_rate": 2.748538011695906e-05, + "loss": 7.135, + "step": 950, + "task_loss": 0.2832848131656647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.025101162405, + "compression/movement_sparsity/importance_threshold": -0.00014050095664453873, + "compression/movement_sparsity/linear_layer_sparsity": 0.653826243036736, + "compression/movement_sparsity/model_sparsity": 0.5877737368678947, + "compression_loss": 6.813790321350098, + "distillation_loss": 0.9059510231018066, + "epoch": 2.41, + "learning_rate": 2.7777777777777772e-05, + "loss": 7.2781, + "step": 960, + "task_loss": 0.3703336715698242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.025409617055, + "compression/movement_sparsity/importance_threshold": -0.00013759212747379855, + "compression/movement_sparsity/linear_layer_sparsity": 0.6619689099668774, + "compression/movement_sparsity/model_sparsity": 0.5950937944834637, + "compression_loss": 6.897071838378906, + "distillation_loss": 0.3913211226463318, + "epoch": 2.43, + "learning_rate": 2.807017543859649e-05, + "loss": 7.3028, + "step": 970, + "task_loss": 0.11039167642593384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.025713784705, + "compression/movement_sparsity/importance_threshold": -0.00013472372612820209, + "compression/movement_sparsity/linear_layer_sparsity": 0.6690985866455886, + "compression/movement_sparsity/model_sparsity": 0.6015031987383959, + "compression_loss": 6.979084491729736, + "distillation_loss": 0.7541512250900269, + "epoch": 2.46, + "learning_rate": 2.83625730994152e-05, + "loss": 7.3291, + "step": 980, + "task_loss": 0.5565091967582703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.026013695355, + "compression/movement_sparsity/importance_threshold": -0.0001318954696978463, + "compression/movement_sparsity/linear_layer_sparsity": 0.6808277697794339, + "compression/movement_sparsity/model_sparsity": 0.6120474463491494, + "compression_loss": 7.059789657592773, + "distillation_loss": 0.4585632085800171, + "epoch": 2.48, + "learning_rate": 2.8654970760233914e-05, + "loss": 7.4721, + "step": 990, + "task_loss": 0.2957744300365448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.026309379005, + "compression/movement_sparsity/importance_threshold": -0.00012910707527282811, + "compression/movement_sparsity/linear_layer_sparsity": 0.6891670665838603, + "compression/movement_sparsity/model_sparsity": 0.6195442693931776, + "compression_loss": 7.139459609985352, + "distillation_loss": 0.43910276889801025, + "epoch": 2.51, + "learning_rate": 2.8947368421052627e-05, + "loss": 7.5405, + "step": 1000, + "task_loss": 0.24162358045578003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.026600865655, + "compression/movement_sparsity/importance_threshold": -0.00012635825994324458, + "compression/movement_sparsity/linear_layer_sparsity": 0.6972308322041554, + "compression/movement_sparsity/model_sparsity": 0.6267933966687271, + "compression_loss": 7.217947483062744, + "distillation_loss": 0.6317572593688965, + "epoch": 2.53, + "learning_rate": 2.923976608187134e-05, + "loss": 7.5692, + "step": 1010, + "task_loss": 0.2147519886493683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.026888185305, + "compression/movement_sparsity/importance_threshold": -0.00012364874079919257, + "compression/movement_sparsity/linear_layer_sparsity": 0.7072264684018368, + "compression/movement_sparsity/model_sparsity": 0.6357792281535505, + "compression_loss": 7.2952799797058105, + "distillation_loss": 0.9247174263000488, + "epoch": 2.56, + "learning_rate": 2.9532163742690056e-05, + "loss": 7.7218, + "step": 1020, + "task_loss": 0.4004303216934204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.027171367955, + "compression/movement_sparsity/importance_threshold": -0.00012097823493076911, + "compression/movement_sparsity/linear_layer_sparsity": 0.71374941188648, + "compression/movement_sparsity/model_sparsity": 0.6416431941662016, + "compression_loss": 7.371541976928711, + "distillation_loss": 1.099261999130249, + "epoch": 2.58, + "learning_rate": 2.982456140350877e-05, + "loss": 7.7844, + "step": 1030, + "task_loss": 0.45426321029663086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.027450443605000003, + "compression/movement_sparsity/importance_threshold": -0.00011834645942807113, + "compression/movement_sparsity/linear_layer_sparsity": 0.7193642610471244, + "compression/movement_sparsity/model_sparsity": 0.6466908056811099, + "compression_loss": 7.4467082023620605, + "distillation_loss": 1.6781150102615356, + "epoch": 2.61, + "learning_rate": 3.011695906432748e-05, + "loss": 7.8146, + "step": 1040, + "task_loss": 0.8923892974853516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.027725442255, + "compression/movement_sparsity/importance_threshold": -0.00011575313138119565, + "compression/movement_sparsity/linear_layer_sparsity": 0.7244986567487203, + "compression/movement_sparsity/model_sparsity": 0.6513065013345436, + "compression_loss": 7.520779609680176, + "distillation_loss": 0.12322430312633514, + "epoch": 2.63, + "learning_rate": 3.0409356725146194e-05, + "loss": 7.8336, + "step": 1050, + "task_loss": 0.023247644305229187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.027996393905000003, + "compression/movement_sparsity/importance_threshold": -0.00011319796788023954, + "compression/movement_sparsity/linear_layer_sparsity": 0.7297716943315267, + "compression/movement_sparsity/model_sparsity": 0.6560468326346391, + "compression_loss": 7.5938239097595215, + "distillation_loss": 1.5146890878677368, + "epoch": 2.66, + "learning_rate": 3.070175438596491e-05, + "loss": 8.0602, + "step": 1060, + "task_loss": 0.8802045583724976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.028263328555, + "compression/movement_sparsity/importance_threshold": -0.0001106806860152999, + "compression/movement_sparsity/linear_layer_sparsity": 0.7346848182023487, + "compression/movement_sparsity/model_sparsity": 0.6604636103458478, + "compression_loss": 7.665744304656982, + "distillation_loss": 0.43239685893058777, + "epoch": 2.68, + "learning_rate": 3.099415204678362e-05, + "loss": 8.035, + "step": 1070, + "task_loss": 0.1623140275478363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.028526276205, + "compression/movement_sparsity/importance_threshold": -0.00010820100287647357, + "compression/movement_sparsity/linear_layer_sparsity": 0.7404099974593495, + "compression/movement_sparsity/model_sparsity": 0.6656104059080701, + "compression_loss": 7.736471176147461, + "distillation_loss": 0.46192824840545654, + "epoch": 2.71, + "learning_rate": 3.128654970760233e-05, + "loss": 8.0645, + "step": 1080, + "task_loss": 0.21585744619369507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.028785266855, + "compression/movement_sparsity/importance_threshold": -0.0001057586355538575, + "compression/movement_sparsity/linear_layer_sparsity": 0.7466543633318278, + "compression/movement_sparsity/model_sparsity": 0.6712239374882496, + "compression_loss": 7.806110858917236, + "distillation_loss": 2.1456828117370605, + "epoch": 2.73, + "learning_rate": 3.1578947368421045e-05, + "loss": 8.1415, + "step": 1090, + "task_loss": 1.1881322860717773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.029040330505000003, + "compression/movement_sparsity/importance_threshold": -0.0001033533011375488, + "compression/movement_sparsity/linear_layer_sparsity": 0.7539027095566094, + "compression/movement_sparsity/model_sparsity": 0.6777400227510018, + "compression_loss": 7.874512195587158, + "distillation_loss": 0.3218111991882324, + "epoch": 2.76, + "learning_rate": 3.1871345029239764e-05, + "loss": 8.2399, + "step": 1100, + "task_loss": 0.1626085638999939 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.029291497154999995, + "compression/movement_sparsity/importance_threshold": -0.00010098471671764442, + "compression/movement_sparsity/linear_layer_sparsity": 0.7604099739348088, + "compression/movement_sparsity/model_sparsity": 0.6835898936319823, + "compression_loss": 7.941775798797607, + "distillation_loss": 0.9839245676994324, + "epoch": 2.78, + "learning_rate": 3.216374269005848e-05, + "loss": 8.3905, + "step": 1110, + "task_loss": 0.3942227065563202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.029538796805, + "compression/movement_sparsity/importance_threshold": -9.86525993842411e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.766957206507829, + "compression/movement_sparsity/model_sparsity": 0.6894756949386317, + "compression_loss": 8.007960319519043, + "distillation_loss": 0.7333301305770874, + "epoch": 2.81, + "learning_rate": 3.245614035087719e-05, + "loss": 8.397, + "step": 1120, + "task_loss": 0.3495325744152069 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.029782259455, + "compression/movement_sparsity/importance_threshold": -9.635666622743598e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.7723407153342366, + "compression/movement_sparsity/model_sparsity": 0.6943153371739483, + "compression_loss": 8.0732421875, + "distillation_loss": 0.6021356582641602, + "epoch": 2.83, + "learning_rate": 3.27485380116959e-05, + "loss": 8.4913, + "step": 1130, + "task_loss": 0.40355244278907776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.030021915105000005, + "compression/movement_sparsity/importance_threshold": -9.409663433732605e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.7760336447982535, + "compression/movement_sparsity/model_sparsity": 0.6976351900770273, + "compression_loss": 8.137558937072754, + "distillation_loss": 0.49863767623901367, + "epoch": 2.86, + "learning_rate": 3.3040935672514615e-05, + "loss": 8.5122, + "step": 1140, + "task_loss": 0.14769184589385986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.030257793755, + "compression/movement_sparsity/importance_threshold": -9.187222080400827e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.7808857460102379, + "compression/movement_sparsity/model_sparsity": 0.7019971099164379, + "compression_loss": 8.200807571411133, + "distillation_loss": 0.9741989970207214, + "epoch": 2.88, + "learning_rate": 3.333333333333333e-05, + "loss": 8.5404, + "step": 1150, + "task_loss": 0.5826123952865601 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.030489925404999996, + "compression/movement_sparsity/importance_threshold": -8.968314271757958e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.7852331987729599, + "compression/movement_sparsity/model_sparsity": 0.7059053631923137, + "compression_loss": 8.263005256652832, + "distillation_loss": 0.46970468759536743, + "epoch": 2.91, + "learning_rate": 3.362573099415204e-05, + "loss": 8.6595, + "step": 1160, + "task_loss": 0.5217117071151733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.030718340054999994, + "compression/movement_sparsity/importance_threshold": -8.752911716813688e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.7899744405864197, + "compression/movement_sparsity/model_sparsity": 0.710167623154759, + "compression_loss": 8.32394790649414, + "distillation_loss": 0.6907594203948975, + "epoch": 2.93, + "learning_rate": 3.3918128654970754e-05, + "loss": 8.7009, + "step": 1170, + "task_loss": 0.17913195490837097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.030943067705000004, + "compression/movement_sparsity/importance_threshold": -8.540986124577711e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.7945701713527552, + "compression/movement_sparsity/model_sparsity": 0.7142990722590679, + "compression_loss": 8.384001731872559, + "distillation_loss": 0.9414887428283691, + "epoch": 2.96, + "learning_rate": 3.4210526315789466e-05, + "loss": 8.8272, + "step": 1180, + "task_loss": 0.6663200259208679 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.031164138355, + "compression/movement_sparsity/importance_threshold": -8.33250920405974e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.7987915561013249, + "compression/movement_sparsity/model_sparsity": 0.7180939934859978, + "compression_loss": 8.443134307861328, + "distillation_loss": 0.538488507270813, + "epoch": 2.98, + "learning_rate": 3.450292397660818e-05, + "loss": 8.7806, + "step": 1190, + "task_loss": 0.26449117064476013 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.9633715798764343, + "eval_loss": 8.649137496948242, + "eval_runtime": 86.2151, + "eval_samples_per_second": 78.849, + "eval_steps_per_second": 2.471, + "step": 1197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03140312804, + "compression/movement_sparsity/importance_threshold": -8.107134042027329e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8017074229336043, + "compression/movement_sparsity/model_sparsity": 0.7207152861650097, + "compression_loss": 8.507065773010254, + "distillation_loss": 0.6942938566207886, + "epoch": 3.01, + "learning_rate": 3.47953216374269e-05, + "loss": 9.0818, + "step": 1200, + "task_loss": 0.3004140853881836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03161661664, + "compression/movement_sparsity/importance_threshold": -7.905807244943714e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.804804922745408, + "compression/movement_sparsity/model_sparsity": 0.7234998624323609, + "compression_loss": 8.564077377319336, + "distillation_loss": 0.49231940507888794, + "epoch": 3.03, + "learning_rate": 3.508771929824561e-05, + "loss": 8.9, + "step": 1210, + "task_loss": 0.2517680525779724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03182654124, + "compression/movement_sparsity/importance_threshold": -7.70784141750815e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8087666200880759, + "compression/movement_sparsity/model_sparsity": 0.7270613310583741, + "compression_loss": 8.620096206665039, + "distillation_loss": 0.5323042273521423, + "epoch": 3.06, + "learning_rate": 3.5380116959064324e-05, + "loss": 8.9514, + "step": 1220, + "task_loss": 0.19435852766036987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03203293184, + "compression/movement_sparsity/importance_threshold": -7.51320826873034e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8120925784778681, + "compression/movement_sparsity/model_sparsity": 0.7300512859772156, + "compression_loss": 8.675074577331543, + "distillation_loss": 0.7101088762283325, + "epoch": 3.08, + "learning_rate": 3.567251461988304e-05, + "loss": 9.0237, + "step": 1230, + "task_loss": 0.5060630440711975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03223581844, + "compression/movement_sparsity/importance_threshold": -7.321879507619987e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8161153220039145, + "compression/movement_sparsity/model_sparsity": 0.7336676336230114, + "compression_loss": 8.729106903076172, + "distillation_loss": 0.5983076095581055, + "epoch": 3.11, + "learning_rate": 3.5964912280701756e-05, + "loss": 9.0547, + "step": 1240, + "task_loss": 0.27109333872795105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03243523104, + "compression/movement_sparsity/importance_threshold": -7.133826843186772e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8184793031090033, + "compression/movement_sparsity/model_sparsity": 0.7357927945856084, + "compression_loss": 8.78226375579834, + "distillation_loss": 0.8876118659973145, + "epoch": 3.13, + "learning_rate": 3.625730994152046e-05, + "loss": 9.1706, + "step": 1250, + "task_loss": 0.4600517749786377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03263119964, + "compression/movement_sparsity/importance_threshold": -6.949021984440403e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8217362052092743, + "compression/movement_sparsity/model_sparsity": 0.7387206695959445, + "compression_loss": 8.834550857543945, + "distillation_loss": 0.35698211193084717, + "epoch": 3.16, + "learning_rate": 3.654970760233918e-05, + "loss": 9.1858, + "step": 1260, + "task_loss": 0.1868249475955963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03282375424, + "compression/movement_sparsity/importance_threshold": -6.767436640390569e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8237155835968082, + "compression/movement_sparsity/model_sparsity": 0.7405000821599198, + "compression_loss": 8.88587760925293, + "distillation_loss": 1.1052513122558594, + "epoch": 3.18, + "learning_rate": 3.684210526315789e-05, + "loss": 9.3054, + "step": 1270, + "task_loss": 0.6997582316398621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03301292484, + "compression/movement_sparsity/importance_threshold": -6.58904252004697e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8271621287827462, + "compression/movement_sparsity/model_sparsity": 0.743598441647318, + "compression_loss": 8.936222076416016, + "distillation_loss": 0.47881120443344116, + "epoch": 3.21, + "learning_rate": 3.713450292397661e-05, + "loss": 9.2533, + "step": 1280, + "task_loss": 0.2920542061328888 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03319874144, + "compression/movement_sparsity/importance_threshold": -6.413811332419306e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8296238778794038, + "compression/movement_sparsity/model_sparsity": 0.745811493633505, + "compression_loss": 8.985671997070312, + "distillation_loss": 1.6269311904907227, + "epoch": 3.23, + "learning_rate": 3.7426900584795313e-05, + "loss": 9.2957, + "step": 1290, + "task_loss": 0.6221576929092407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03338123404, + "compression/movement_sparsity/importance_threshold": -6.241714786517268e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8331334335478772, + "compression/movement_sparsity/model_sparsity": 0.7489664979973916, + "compression_loss": 9.034184455871582, + "distillation_loss": 1.1283040046691895, + "epoch": 3.26, + "learning_rate": 3.771929824561403e-05, + "loss": 9.4048, + "step": 1300, + "task_loss": 0.692903459072113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.033560432640000006, + "compression/movement_sparsity/importance_threshold": -6.0727245913505545e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8352013465447155, + "compression/movement_sparsity/model_sparsity": 0.7508255010010398, + "compression_loss": 9.081707000732422, + "distillation_loss": 0.40741151571273804, + "epoch": 3.28, + "learning_rate": 3.8011695906432746e-05, + "loss": 9.4243, + "step": 1310, + "task_loss": 0.13411134481430054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03373636724, + "compression/movement_sparsity/importance_threshold": -5.906812455928862e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8383988256549232, + "compression/movement_sparsity/model_sparsity": 0.7536999562025241, + "compression_loss": 9.128183364868164, + "distillation_loss": 0.6723507046699524, + "epoch": 3.31, + "learning_rate": 3.830409356725146e-05, + "loss": 9.5088, + "step": 1320, + "task_loss": 0.2996169328689575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03390906784, + "compression/movement_sparsity/importance_threshold": -5.7439500892618944e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8412121019647697, + "compression/movement_sparsity/model_sparsity": 0.7562290225211231, + "compression_loss": 9.1739501953125, + "distillation_loss": 0.4882902503013611, + "epoch": 3.33, + "learning_rate": 3.859649122807017e-05, + "loss": 9.4872, + "step": 1330, + "task_loss": 0.26708441972732544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03407856444, + "compression/movement_sparsity/importance_threshold": -5.5841092003593325e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8429411557136405, + "compression/movement_sparsity/model_sparsity": 0.7577833993820136, + "compression_loss": 9.218880653381348, + "distillation_loss": 0.8691713809967041, + "epoch": 3.36, + "learning_rate": 3.8888888888888884e-05, + "loss": 9.5652, + "step": 1340, + "task_loss": 0.5940840840339661 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03424488704, + "compression/movement_sparsity/importance_threshold": -5.427261498230885e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8445117246311352, + "compression/movement_sparsity/model_sparsity": 0.7591953022713147, + "compression_loss": 9.263068199157715, + "distillation_loss": 0.4419710636138916, + "epoch": 3.38, + "learning_rate": 3.91812865497076e-05, + "loss": 9.546, + "step": 1350, + "task_loss": 0.26113927364349365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.034408065640000006, + "compression/movement_sparsity/importance_threshold": -5.2733786918862386e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.846551690473502, + "compression/movement_sparsity/model_sparsity": 0.7610291814693747, + "compression_loss": 9.306303977966309, + "distillation_loss": 1.0447174310684204, + "epoch": 3.41, + "learning_rate": 3.9473684210526316e-05, + "loss": 9.6604, + "step": 1360, + "task_loss": 0.6508020162582397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03456813024, + "compression/movement_sparsity/importance_threshold": -5.122432490335102e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8479912041741945, + "compression/movement_sparsity/model_sparsity": 0.7623232689370156, + "compression_loss": 9.348631858825684, + "distillation_loss": 0.6512007713317871, + "epoch": 3.43, + "learning_rate": 3.976608187134502e-05, + "loss": 9.6694, + "step": 1370, + "task_loss": 0.38973358273506165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03472511084, + "compression/movement_sparsity/importance_threshold": -4.974394602587167e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8504058218533574, + "compression/movement_sparsity/model_sparsity": 0.7644939509362529, + "compression_loss": 9.389874458312988, + "distillation_loss": 1.7942403554916382, + "epoch": 3.46, + "learning_rate": 4.005847953216374e-05, + "loss": 9.7758, + "step": 1380, + "task_loss": 0.7294902801513672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03487903744, + "compression/movement_sparsity/importance_threshold": -4.8292367376521204e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8521922637195122, + "compression/movement_sparsity/model_sparsity": 0.7660999183053359, + "compression_loss": 9.430490493774414, + "distillation_loss": 0.4566546678543091, + "epoch": 3.48, + "learning_rate": 4.035087719298245e-05, + "loss": 9.8213, + "step": 1390, + "task_loss": 0.14384031295776367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03502994004, + "compression/movement_sparsity/importance_threshold": -4.686930604539676e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8542740561954231, + "compression/movement_sparsity/model_sparsity": 0.7679713986198402, + "compression_loss": 9.47036361694336, + "distillation_loss": 0.6844474673271179, + "epoch": 3.51, + "learning_rate": 4.064327485380117e-05, + "loss": 9.8039, + "step": 1400, + "task_loss": 0.5877740979194641 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03517784864, + "compression/movement_sparsity/importance_threshold": -4.547447912259516e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8564435834462512, + "compression/movement_sparsity/model_sparsity": 0.7699217503425445, + "compression_loss": 9.50930118560791, + "distillation_loss": 1.0622820854187012, + "epoch": 3.53, + "learning_rate": 4.093567251461988e-05, + "loss": 9.821, + "step": 1410, + "task_loss": 0.4445667266845703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.035322793240000004, + "compression/movement_sparsity/importance_threshold": -4.4107603698213425e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8580350892050587, + "compression/movement_sparsity/model_sparsity": 0.7713524749380523, + "compression_loss": 9.547521591186523, + "distillation_loss": 0.6739060282707214, + "epoch": 3.56, + "learning_rate": 4.122807017543859e-05, + "loss": 9.8988, + "step": 1420, + "task_loss": 0.3877226710319519 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03546480384, + "compression/movement_sparsity/importance_threshold": -4.2768396862348534e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8592993215522433, + "compression/movement_sparsity/model_sparsity": 0.7724889887731695, + "compression_loss": 9.585060119628906, + "distillation_loss": 0.45944637060165405, + "epoch": 3.58, + "learning_rate": 4.1520467836257306e-05, + "loss": 9.9251, + "step": 1430, + "task_loss": 0.2797248959541321 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.035603910440000004, + "compression/movement_sparsity/importance_threshold": -4.145657570509741e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.860241573509485, + "compression/movement_sparsity/model_sparsity": 0.773336050144409, + "compression_loss": 9.62182903289795, + "distillation_loss": 0.20794469118118286, + "epoch": 3.61, + "learning_rate": 4.181286549707602e-05, + "loss": 9.9025, + "step": 1440, + "task_loss": 0.049867182970047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.035740143040000005, + "compression/movement_sparsity/importance_threshold": -4.017185731655708e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.861661949996236, + "compression/movement_sparsity/model_sparsity": 0.7746129337266576, + "compression_loss": 9.657830238342285, + "distillation_loss": 1.0586156845092773, + "epoch": 3.63, + "learning_rate": 4.210526315789473e-05, + "loss": 9.9896, + "step": 1450, + "task_loss": 0.6514222025871277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.035873531640000005, + "compression/movement_sparsity/importance_threshold": -3.891395878682447e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8629507537262873, + "compression/movement_sparsity/model_sparsity": 0.775771536631587, + "compression_loss": 9.692946434020996, + "distillation_loss": 0.4212111234664917, + "epoch": 3.66, + "learning_rate": 4.239766081871345e-05, + "loss": 10.0329, + "step": 1460, + "task_loss": 0.21880501508712769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03600410624, + "compression/movement_sparsity/importance_threshold": -3.76825972059965e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.864051102359982, + "compression/movement_sparsity/model_sparsity": 0.7767607230326717, + "compression_loss": 9.7272367477417, + "distillation_loss": 0.5302681922912598, + "epoch": 3.68, + "learning_rate": 4.2690058479532157e-05, + "loss": 10.0569, + "step": 1470, + "task_loss": 0.4268465042114258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.036131896840000004, + "compression/movement_sparsity/importance_threshold": -3.647748966417025e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8658141373080398, + "compression/movement_sparsity/model_sparsity": 0.7783456481571752, + "compression_loss": 9.760616302490234, + "distillation_loss": 0.399345338344574, + "epoch": 3.71, + "learning_rate": 4.2982456140350876e-05, + "loss": 10.0997, + "step": 1480, + "task_loss": 0.30424150824546814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03625693344, + "compression/movement_sparsity/importance_threshold": -3.529835325144254e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8674597847975007, + "compression/movement_sparsity/model_sparsity": 0.7798250448390142, + "compression_loss": 9.793109893798828, + "distillation_loss": 0.43434837460517883, + "epoch": 3.73, + "learning_rate": 4.327485380116958e-05, + "loss": 10.1041, + "step": 1490, + "task_loss": 0.2279742956161499 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03637924604, + "compression/movement_sparsity/importance_threshold": -3.4144905057910456e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8694156856933153, + "compression/movement_sparsity/model_sparsity": 0.7815833517144565, + "compression_loss": 9.824585914611816, + "distillation_loss": 0.8438689708709717, + "epoch": 3.76, + "learning_rate": 4.35672514619883e-05, + "loss": 10.1255, + "step": 1500, + "task_loss": 0.44409793615341187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03649886464, + "compression/movement_sparsity/importance_threshold": -3.3016862173670914e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8707779212774767, + "compression/movement_sparsity/model_sparsity": 0.7828079680530087, + "compression_loss": 9.855549812316895, + "distillation_loss": 0.7053812742233276, + "epoch": 3.78, + "learning_rate": 4.3859649122807014e-05, + "loss": 10.2668, + "step": 1510, + "task_loss": 0.37573811411857605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03661581924, + "compression/movement_sparsity/importance_threshold": -3.191394168882084e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8729380622365251, + "compression/movement_sparsity/model_sparsity": 0.7847498817298967, + "compression_loss": 9.886014938354492, + "distillation_loss": 0.4807725250720978, + "epoch": 3.81, + "learning_rate": 4.415204678362573e-05, + "loss": 10.2452, + "step": 1520, + "task_loss": 0.16565313935279846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03673013984, + "compression/movement_sparsity/importance_threshold": -3.083586069345726e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8737130194218609, + "compression/movement_sparsity/model_sparsity": 0.7854465492093509, + "compression_loss": 9.915853500366211, + "distillation_loss": 0.5819356441497803, + "epoch": 3.83, + "learning_rate": 4.444444444444444e-05, + "loss": 10.2462, + "step": 1530, + "task_loss": 0.3054378628730774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03684185644, + "compression/movement_sparsity/importance_threshold": -2.9782336277677207e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8748435029923216, + "compression/movement_sparsity/model_sparsity": 0.7864628261785824, + "compression_loss": 9.944876670837402, + "distillation_loss": 0.7969239354133606, + "epoch": 3.86, + "learning_rate": 4.473684210526315e-05, + "loss": 10.3467, + "step": 1540, + "task_loss": 0.3591158986091614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03695099904, + "compression/movement_sparsity/importance_threshold": -2.875308553157744e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8757265789671785, + "compression/movement_sparsity/model_sparsity": 0.7872566897948075, + "compression_loss": 9.973067283630371, + "distillation_loss": 0.6269478797912598, + "epoch": 3.88, + "learning_rate": 4.5029239766081865e-05, + "loss": 10.2859, + "step": 1550, + "task_loss": 0.30371320247650146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03705759764, + "compression/movement_sparsity/importance_threshold": -2.77478255452551e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8767618469587474, + "compression/movement_sparsity/model_sparsity": 0.7881873702967676, + "compression_loss": 10.00046443939209, + "distillation_loss": 0.4931795001029968, + "epoch": 3.91, + "learning_rate": 4.5321637426900585e-05, + "loss": 10.3543, + "step": 1560, + "task_loss": 0.308719664812088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03716168224, + "compression/movement_sparsity/importance_threshold": -2.6766273408807104e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8784133638211382, + "compression/movement_sparsity/model_sparsity": 0.7896720434007397, + "compression_loss": 10.027097702026367, + "distillation_loss": 0.1957966387271881, + "epoch": 3.93, + "learning_rate": 4.561403508771929e-05, + "loss": 10.292, + "step": 1570, + "task_loss": 0.058112651109695435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03726328284, + "compression/movement_sparsity/importance_threshold": -2.5808146212330377e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8799685947380307, + "compression/movement_sparsity/model_sparsity": 0.7910701578041456, + "compression_loss": 10.053182601928711, + "distillation_loss": 0.5972878932952881, + "epoch": 3.96, + "learning_rate": 4.590643274853801e-05, + "loss": 10.4266, + "step": 1580, + "task_loss": 0.23346292972564697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03736242944, + "compression/movement_sparsity/importance_threshold": -2.4873161045922004e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8811241084199036, + "compression/movement_sparsity/model_sparsity": 0.7921089362288872, + "compression_loss": 10.078638076782227, + "distillation_loss": 0.35232701897621155, + "epoch": 3.98, + "learning_rate": 4.6198830409356716e-05, + "loss": 10.4524, + "step": 1590, + "task_loss": 0.06011241674423218 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.9554280670785525, + "eval_loss": 10.270051956176758, + "eval_runtime": 86.3945, + "eval_samples_per_second": 78.686, + "eval_steps_per_second": 2.465, + "step": 1596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.037468692134999995, + "compression/movement_sparsity/importance_threshold": -2.3871068754632266e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8816427657332129, + "compression/movement_sparsity/model_sparsity": 0.7925751964171931, + "compression_loss": 10.105827331542969, + "distillation_loss": 0.40998631715774536, + "epoch": 4.01, + "learning_rate": 4.6491228070175436e-05, + "loss": 10.7045, + "step": 1600, + "task_loss": 0.21127372980117798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.037562782985, + "compression/movement_sparsity/importance_threshold": -2.298376097963279e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8831190835779885, + "compression/movement_sparsity/model_sparsity": 0.7939023699065872, + "compression_loss": 10.129731178283691, + "distillation_loss": 0.547886312007904, + "epoch": 4.04, + "learning_rate": 4.678362573099415e-05, + "loss": 10.4625, + "step": 1610, + "task_loss": 0.2604964077472687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.037654512835, + "compression/movement_sparsity/importance_threshold": -2.2118718214002167e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8844327409853959, + "compression/movement_sparsity/model_sparsity": 0.795083315657143, + "compression_loss": 10.153118133544922, + "distillation_loss": 0.3340034782886505, + "epoch": 4.06, + "learning_rate": 4.707602339181286e-05, + "loss": 10.4671, + "step": 1620, + "task_loss": 0.111962229013443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.037743911685000006, + "compression/movement_sparsity/importance_threshold": -2.1275657547837376e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8854097386894008, + "compression/movement_sparsity/model_sparsity": 0.7959616126014918, + "compression_loss": 10.175905227661133, + "distillation_loss": 0.7999333143234253, + "epoch": 4.09, + "learning_rate": 4.7368421052631574e-05, + "loss": 10.5007, + "step": 1630, + "task_loss": 0.41760003566741943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.037831009535, + "compression/movement_sparsity/importance_threshold": -2.045429607123539e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8865368700127974, + "compression/movement_sparsity/model_sparsity": 0.7969748759829319, + "compression_loss": 10.197953224182129, + "distillation_loss": 0.8680118322372437, + "epoch": 4.11, + "learning_rate": 4.766081871345029e-05, + "loss": 10.548, + "step": 1640, + "task_loss": 0.4344426393508911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.037915836385, + "compression/movement_sparsity/importance_threshold": -1.9654350874293187e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8878363656466426, + "compression/movement_sparsity/model_sparsity": 0.7981430906468174, + "compression_loss": 10.219234466552734, + "distillation_loss": 0.37934863567352295, + "epoch": 4.14, + "learning_rate": 4.7953216374269e-05, + "loss": 10.6091, + "step": 1650, + "task_loss": 0.16707447171211243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.037998422235, + "compression/movement_sparsity/importance_threshold": -1.8875539047107635e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8892580006963264, + "compression/movement_sparsity/model_sparsity": 0.7994211056462369, + "compression_loss": 10.240059852600098, + "distillation_loss": 0.2461932748556137, + "epoch": 4.16, + "learning_rate": 4.824561403508772e-05, + "loss": 10.5569, + "step": 1660, + "task_loss": 0.21289575099945068 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.038078797085000005, + "compression/movement_sparsity/importance_threshold": -1.8117577679775763e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8904640568541102, + "compression/movement_sparsity/model_sparsity": 0.8005053205156809, + "compression_loss": 10.260123252868652, + "distillation_loss": 0.2822727560997009, + "epoch": 4.19, + "learning_rate": 4.8538011695906425e-05, + "loss": 10.5789, + "step": 1670, + "task_loss": 0.10792559385299683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.038156990935, + "compression/movement_sparsity/importance_threshold": -1.73801838623946e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8910806821175851, + "compression/movement_sparsity/model_sparsity": 0.801059651485445, + "compression_loss": 10.279339790344238, + "distillation_loss": 0.46864134073257446, + "epoch": 4.21, + "learning_rate": 4.8830409356725145e-05, + "loss": 10.6432, + "step": 1680, + "task_loss": 0.26517313718795776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.038233033785, + "compression/movement_sparsity/importance_threshold": -1.666307468506102e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8926093067788317, + "compression/movement_sparsity/model_sparsity": 0.8024338475183792, + "compression_loss": 10.29808521270752, + "distillation_loss": 0.48441100120544434, + "epoch": 4.24, + "learning_rate": 4.912280701754385e-05, + "loss": 10.5869, + "step": 1690, + "task_loss": 0.33757680654525757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.038306955635, + "compression/movement_sparsity/importance_threshold": -1.596596723787199e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8934476390770852, + "compression/movement_sparsity/model_sparsity": 0.8031874876680816, + "compression_loss": 10.316293716430664, + "distillation_loss": 0.4501994550228119, + "epoch": 4.26, + "learning_rate": 4.941520467836257e-05, + "loss": 10.6399, + "step": 1700, + "task_loss": 0.1676151156425476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.038378786485, + "compression/movement_sparsity/importance_threshold": -1.5288578610924547e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8948277650745258, + "compression/movement_sparsity/model_sparsity": 0.8044281870488474, + "compression_loss": 10.333884239196777, + "distillation_loss": 0.3706062138080597, + "epoch": 4.29, + "learning_rate": 4.9707602339181276e-05, + "loss": 10.66, + "step": 1710, + "task_loss": 0.272488534450531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.038448556335, + "compression/movement_sparsity/importance_threshold": -1.4630625894315556e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8957719930931948, + "compression/movement_sparsity/model_sparsity": 0.8052770248507851, + "compression_loss": 10.350982666015625, + "distillation_loss": 0.6956163644790649, + "epoch": 4.31, + "learning_rate": 4.9999999999999996e-05, + "loss": 10.6792, + "step": 1720, + "task_loss": 0.3349671959877014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.038516295185, + "compression/movement_sparsity/importance_threshold": -1.3991826178142103e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8963361351061427, + "compression/movement_sparsity/model_sparsity": 0.8057841746671255, + "compression_loss": 10.367738723754883, + "distillation_loss": 1.0592408180236816, + "epoch": 4.34, + "learning_rate": 5.029239766081871e-05, + "loss": 10.6682, + "step": 1730, + "task_loss": 0.615516722202301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.038582033035, + "compression/movement_sparsity/importance_threshold": -1.3371896552501054e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.896996621875941, + "compression/movement_sparsity/model_sparsity": 0.8063779360539933, + "compression_loss": 10.383992195129395, + "distillation_loss": 0.4240041673183441, + "epoch": 4.36, + "learning_rate": 5.058479532163742e-05, + "loss": 10.802, + "step": 1740, + "task_loss": 0.16404575109481812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.038645799885, + "compression/movement_sparsity/importance_threshold": -1.2770554107489386e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8979182428109004, + "compression/movement_sparsity/model_sparsity": 0.8072064506428247, + "compression_loss": 10.399662017822266, + "distillation_loss": 0.40452438592910767, + "epoch": 4.39, + "learning_rate": 5.0877192982456134e-05, + "loss": 10.7784, + "step": 1750, + "task_loss": 0.19599807262420654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.038707625735, + "compression/movement_sparsity/importance_threshold": -1.2187515933204075e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8988828077950919, + "compression/movement_sparsity/model_sparsity": 0.8080735708773643, + "compression_loss": 10.414520263671875, + "distillation_loss": 0.8717334866523743, + "epoch": 4.41, + "learning_rate": 5.1169590643274853e-05, + "loss": 10.7116, + "step": 1760, + "task_loss": 0.3821563720703125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.038767540585, + "compression/movement_sparsity/importance_threshold": -1.162249911974215e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9006762482121349, + "compression/movement_sparsity/model_sparsity": 0.8096858297718366, + "compression_loss": 10.428637504577637, + "distillation_loss": 0.6464477777481079, + "epoch": 4.44, + "learning_rate": 5.146198830409356e-05, + "loss": 10.7727, + "step": 1770, + "task_loss": 0.4439537823200226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.038825574435, + "compression/movement_sparsity/importance_threshold": -1.1075220757200482e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9014851630721168, + "compression/movement_sparsity/model_sparsity": 0.8104130243669192, + "compression_loss": 10.442300796508789, + "distillation_loss": 0.9200720191001892, + "epoch": 4.46, + "learning_rate": 5.175438596491228e-05, + "loss": 10.7621, + "step": 1780, + "task_loss": 0.31766802072525024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.038881757285, + "compression/movement_sparsity/importance_threshold": -1.0545397935676098e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9020580444519724, + "compression/movement_sparsity/model_sparsity": 0.810928030659502, + "compression_loss": 10.45583724975586, + "distillation_loss": 0.8315671682357788, + "epoch": 4.49, + "learning_rate": 5.2046783625730985e-05, + "loss": 10.8142, + "step": 1790, + "task_loss": 0.42072778940200806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.038936119135, + "compression/movement_sparsity/importance_threshold": -1.0032747745265921e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9024175582467631, + "compression/movement_sparsity/model_sparsity": 0.8112512247326524, + "compression_loss": 10.469006538391113, + "distillation_loss": 0.5176489353179932, + "epoch": 4.51, + "learning_rate": 5.2339181286549704e-05, + "loss": 10.9226, + "step": 1800, + "task_loss": 0.2870589792728424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.038988689985, + "compression/movement_sparsity/importance_threshold": -9.536987276066982e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9029426612842517, + "compression/movement_sparsity/model_sparsity": 0.8117232794687119, + "compression_loss": 10.481606483459473, + "distillation_loss": 0.5399871468544006, + "epoch": 4.54, + "learning_rate": 5.263157894736841e-05, + "loss": 10.8043, + "step": 1810, + "task_loss": 0.26069051027297974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039039499835, + "compression/movement_sparsity/importance_threshold": -9.057833618176148e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.903573718853508, + "compression/movement_sparsity/model_sparsity": 0.8122905847269679, + "compression_loss": 10.493739128112793, + "distillation_loss": 0.3877241611480713, + "epoch": 4.56, + "learning_rate": 5.292397660818713e-05, + "loss": 10.807, + "step": 1820, + "task_loss": 0.24742752313613892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039088578685, + "compression/movement_sparsity/importance_threshold": -8.595003861690395e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.904272821157031, + "compression/movement_sparsity/model_sparsity": 0.8129190605303956, + "compression_loss": 10.505409240722656, + "distillation_loss": 0.25960269570350647, + "epoch": 4.59, + "learning_rate": 5.321637426900584e-05, + "loss": 10.8415, + "step": 1830, + "task_loss": 0.06999611854553223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039135956535, + "compression/movement_sparsity/importance_threshold": -8.148215096706807e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9051532388587775, + "compression/movement_sparsity/model_sparsity": 0.8137105344243719, + "compression_loss": 10.516351699829102, + "distillation_loss": 0.3734590411186218, + "epoch": 4.61, + "learning_rate": 5.3508771929824555e-05, + "loss": 10.8276, + "step": 1840, + "task_loss": 0.2790951132774353 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039181663385, + "compression/movement_sparsity/importance_threshold": -7.717184413322253e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9060454423554652, + "compression/movement_sparsity/model_sparsity": 0.8145126034585837, + "compression_loss": 10.526989936828613, + "distillation_loss": 0.40441250801086426, + "epoch": 4.64, + "learning_rate": 5.380116959064327e-05, + "loss": 10.8386, + "step": 1850, + "task_loss": 0.16886454820632935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039225729235, + "compression/movement_sparsity/importance_threshold": -7.301628901633709e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9065582185335742, + "compression/movement_sparsity/model_sparsity": 0.8149735766507641, + "compression_loss": 10.537022590637207, + "distillation_loss": 0.5824317336082458, + "epoch": 4.66, + "learning_rate": 5.409356725146199e-05, + "loss": 10.859, + "step": 1860, + "task_loss": 0.5143399834632874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039268184085, + "compression/movement_sparsity/importance_threshold": -6.90126565173815e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9073361398110509, + "compression/movement_sparsity/model_sparsity": 0.8156729087762655, + "compression_loss": 10.546780586242676, + "distillation_loss": 0.5486133098602295, + "epoch": 4.69, + "learning_rate": 5.4385964912280694e-05, + "loss": 10.8576, + "step": 1870, + "task_loss": 0.30259478092193604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039309057935, + "compression/movement_sparsity/importance_threshold": -6.515811753732498e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9080930419113219, + "compression/movement_sparsity/model_sparsity": 0.8163533451776145, + "compression_loss": 10.556109428405762, + "distillation_loss": 1.0528051853179932, + "epoch": 4.71, + "learning_rate": 5.467836257309941e-05, + "loss": 10.8712, + "step": 1880, + "task_loss": 0.43705570697784424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039348380785, + "compression/movement_sparsity/importance_threshold": -6.1449842977138384e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9087980606368564, + "compression/movement_sparsity/model_sparsity": 0.8169871396991445, + "compression_loss": 10.565235137939453, + "distillation_loss": 0.3459569215774536, + "epoch": 4.74, + "learning_rate": 5.497076023391812e-05, + "loss": 10.8912, + "step": 1890, + "task_loss": 0.12038382887840271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039386182635, + "compression/movement_sparsity/importance_threshold": -5.788500373778984e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.909603117472147, + "compression/movement_sparsity/model_sparsity": 0.817710866024769, + "compression_loss": 10.573878288269043, + "distillation_loss": 0.5916445851325989, + "epoch": 4.76, + "learning_rate": 5.526315789473684e-05, + "loss": 10.9138, + "step": 1900, + "task_loss": 0.37912750244140625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039422493485, + "compression/movement_sparsity/importance_threshold": -5.44607707202502e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9098056637684432, + "compression/movement_sparsity/model_sparsity": 0.8178929501713303, + "compression_loss": 10.581986427307129, + "distillation_loss": 0.70872563123703, + "epoch": 4.79, + "learning_rate": 5.5555555555555545e-05, + "loss": 10.916, + "step": 1910, + "task_loss": 0.49253469705581665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039457343335, + "compression/movement_sparsity/importance_threshold": -5.117431482548813e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9102467136216501, + "compression/movement_sparsity/model_sparsity": 0.8182894431587645, + "compression_loss": 10.589385986328125, + "distillation_loss": 0.6817111372947693, + "epoch": 4.81, + "learning_rate": 5.5847953216374264e-05, + "loss": 10.9246, + "step": 1920, + "task_loss": 0.3340785503387451 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039490762185, + "compression/movement_sparsity/importance_threshold": -4.802280695447394e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9103951299495634, + "compression/movement_sparsity/model_sparsity": 0.8184228657929871, + "compression_loss": 10.596639633178711, + "distillation_loss": 0.5131574869155884, + "epoch": 4.84, + "learning_rate": 5.614035087719298e-05, + "loss": 10.8974, + "step": 1930, + "task_loss": 0.22064465284347534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039522780035, + "compression/movement_sparsity/importance_threshold": -4.500341800817739e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9110764006511592, + "compression/movement_sparsity/model_sparsity": 0.8190353114241624, + "compression_loss": 10.603409767150879, + "distillation_loss": 0.28842249512672424, + "epoch": 4.86, + "learning_rate": 5.643274853801169e-05, + "loss": 10.9209, + "step": 1940, + "task_loss": 0.08667704463005066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039553426885, + "compression/movement_sparsity/importance_threshold": -4.21133188875677e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9118364903267088, + "compression/movement_sparsity/model_sparsity": 0.8197186133774114, + "compression_loss": 10.609901428222656, + "distillation_loss": 0.42135104537010193, + "epoch": 4.89, + "learning_rate": 5.67251461988304e-05, + "loss": 10.9254, + "step": 1950, + "task_loss": 0.21301895380020142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039582732735, + "compression/movement_sparsity/importance_threshold": -3.934968049361463e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9121054934507679, + "compression/movement_sparsity/model_sparsity": 0.8199604405801907, + "compression_loss": 10.616052627563477, + "distillation_loss": 0.816530704498291, + "epoch": 4.91, + "learning_rate": 5.7017543859649115e-05, + "loss": 10.9946, + "step": 1960, + "task_loss": 0.5122554302215576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039610727585, + "compression/movement_sparsity/importance_threshold": -3.670967372728794e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9126333018104487, + "compression/movement_sparsity/model_sparsity": 0.8204349273344679, + "compression_loss": 10.621758460998535, + "distillation_loss": 0.8322756290435791, + "epoch": 4.94, + "learning_rate": 5.730994152046783e-05, + "loss": 10.9709, + "step": 1970, + "task_loss": 0.23779863119125366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039637441435000004, + "compression/movement_sparsity/importance_threshold": -3.4190469489556847e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9131550996499548, + "compression/movement_sparsity/model_sparsity": 0.8209040107787048, + "compression_loss": 10.627117156982422, + "distillation_loss": 1.0545827150344849, + "epoch": 4.96, + "learning_rate": 5.760233918128655e-05, + "loss": 10.953, + "step": 1980, + "task_loss": 0.510939359664917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039662904285, + "compression/movement_sparsity/importance_threshold": -3.17892386813922e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9135569458559168, + "compression/movement_sparsity/model_sparsity": 0.8212652606499664, + "compression_loss": 10.632038116455078, + "distillation_loss": 0.3127363920211792, + "epoch": 4.99, + "learning_rate": 5.7894736842105253e-05, + "loss": 10.8964, + "step": 1990, + "task_loss": 0.22342097759246826 + }, + { + "epoch": 5.0, + "eval_accuracy": 0.9646954986760812, + "eval_loss": 10.7809419631958, + "eval_runtime": 86.6556, + "eval_samples_per_second": 78.448, + "eval_steps_per_second": 2.458, + "step": 1995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03968950432, + "compression/movement_sparsity/importance_threshold": -2.928076757386603e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9139671550361337, + "compression/movement_sparsity/model_sparsity": 0.8216340286297185, + "compression_loss": 10.637024879455566, + "distillation_loss": 0.5320035815238953, + "epoch": 5.01, + "learning_rate": 5.818713450292397e-05, + "loss": 11.2495, + "step": 2000, + "task_loss": 0.23087036609649658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03971243772, + "compression/movement_sparsity/importance_threshold": -2.7118072250444255e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9145193230578139, + "compression/movement_sparsity/model_sparsity": 0.8221304141219473, + "compression_loss": 10.641243934631348, + "distillation_loss": 0.35051363706588745, + "epoch": 5.04, + "learning_rate": 5.847953216374268e-05, + "loss": 10.9442, + "step": 2010, + "task_loss": 0.327785462141037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03973421312, + "compression/movement_sparsity/importance_threshold": -2.5064580149594478e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9151376420882265, + "compression/movement_sparsity/model_sparsity": 0.8226862677465956, + "compression_loss": 10.645480155944824, + "distillation_loss": 0.5606256723403931, + "epoch": 5.06, + "learning_rate": 5.87719298245614e-05, + "loss": 10.9266, + "step": 2020, + "task_loss": 0.20908093452453613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03975486052, + "compression/movement_sparsity/importance_threshold": -2.311746217228646e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9155960177657332, + "compression/movement_sparsity/model_sparsity": 0.8230983362246153, + "compression_loss": 10.649575233459473, + "distillation_loss": 0.3791353106498718, + "epoch": 5.09, + "learning_rate": 5.906432748538011e-05, + "loss": 10.9635, + "step": 2030, + "task_loss": 0.1277802586555481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03977440992, + "compression/movement_sparsity/importance_threshold": -2.1273889219488337e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9161118286096055, + "compression/movement_sparsity/model_sparsity": 0.8235620375067965, + "compression_loss": 10.653324127197266, + "distillation_loss": 0.5789883136749268, + "epoch": 5.11, + "learning_rate": 5.9356725146198824e-05, + "loss": 10.9533, + "step": 2040, + "task_loss": 0.33982402086257935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03979289132, + "compression/movement_sparsity/importance_threshold": -1.9531032192171495e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9163111049947305, + "compression/movement_sparsity/model_sparsity": 0.8237411820835121, + "compression_loss": 10.656829833984375, + "distillation_loss": 0.4787919521331787, + "epoch": 5.14, + "learning_rate": 5.964912280701754e-05, + "loss": 10.9379, + "step": 2050, + "task_loss": 0.1822199821472168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03981033472, + "compression/movement_sparsity/importance_threshold": -1.788606199130407e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9167545896379102, + "compression/movement_sparsity/model_sparsity": 0.8241398638873422, + "compression_loss": 10.659506797790527, + "distillation_loss": 0.22967106103897095, + "epoch": 5.16, + "learning_rate": 5.994152046783625e-05, + "loss": 10.9419, + "step": 2060, + "task_loss": 0.10331019759178162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03982677012, + "compression/movement_sparsity/importance_threshold": -1.6336149517856904e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9174214515582656, + "compression/movement_sparsity/model_sparsity": 0.8247393563780101, + "compression_loss": 10.66145133972168, + "distillation_loss": 0.4336327314376831, + "epoch": 5.19, + "learning_rate": 6.023391812865496e-05, + "loss": 10.9723, + "step": 2070, + "task_loss": 0.14386022090911865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03984222752, + "compression/movement_sparsity/importance_threshold": -1.4878465672798677e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9181478658536585, + "compression/movement_sparsity/model_sparsity": 0.8253923849914446, + "compression_loss": 10.662842750549316, + "distillation_loss": 0.4906628429889679, + "epoch": 5.21, + "learning_rate": 6.052631578947368e-05, + "loss": 10.9974, + "step": 2080, + "task_loss": 0.27048027515411377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039856736920000004, + "compression/movement_sparsity/importance_threshold": -1.3510181357099691e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.919091517521078, + "compression/movement_sparsity/model_sparsity": 0.826240704667762, + "compression_loss": 10.664116859436035, + "distillation_loss": 0.4878236651420593, + "epoch": 5.24, + "learning_rate": 6.081871345029239e-05, + "loss": 10.9883, + "step": 2090, + "task_loss": 0.28648656606674194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03987032832, + "compression/movement_sparsity/importance_threshold": -1.2228467471729707e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9201948537714544, + "compression/movement_sparsity/model_sparsity": 0.8272325768628783, + "compression_loss": 10.664933204650879, + "distillation_loss": 0.8386104106903076, + "epoch": 5.26, + "learning_rate": 6.111111111111111e-05, + "loss": 10.9988, + "step": 2100, + "task_loss": 0.48160696029663086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03988303172, + "compression/movement_sparsity/importance_threshold": -1.1030494917657944e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9208408611863896, + "compression/movement_sparsity/model_sparsity": 0.8278133216652853, + "compression_loss": 10.665301322937012, + "distillation_loss": 0.33076217770576477, + "epoch": 5.29, + "learning_rate": 6.140350877192981e-05, + "loss": 11.017, + "step": 2110, + "task_loss": 0.15396958589553833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03989487712, + "compression/movement_sparsity/importance_threshold": -9.913434595854704e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9214929379328516, + "compression/movement_sparsity/model_sparsity": 0.8283995226476938, + "compression_loss": 10.665474891662598, + "distillation_loss": 0.3862035870552063, + "epoch": 5.31, + "learning_rate": 6.169590643274853e-05, + "loss": 11.0204, + "step": 2120, + "task_loss": 0.1747991442680359 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03990589452, + "compression/movement_sparsity/importance_threshold": -8.874457407288122e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9218310679200542, + "compression/movement_sparsity/model_sparsity": 0.828703493202932, + "compression_loss": 10.665908813476562, + "distillation_loss": 0.6665565967559814, + "epoch": 5.34, + "learning_rate": 6.198830409356724e-05, + "loss": 11.0217, + "step": 2130, + "task_loss": 0.2977946698665619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039916113919999996, + "compression/movement_sparsity/importance_threshold": -7.910734252930128e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9219254954268292, + "compression/movement_sparsity/model_sparsity": 0.8287883812127227, + "compression_loss": 10.666594505310059, + "distillation_loss": 0.5244303941726685, + "epoch": 5.36, + "learning_rate": 6.228070175438596e-05, + "loss": 11.0024, + "step": 2140, + "task_loss": 0.29131343960762024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03992556532, + "compression/movement_sparsity/importance_threshold": -7.019436033747772e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9223934573547125, + "compression/movement_sparsity/model_sparsity": 0.829209067494427, + "compression_loss": 10.667391777038574, + "distillation_loss": 0.7449897527694702, + "epoch": 5.39, + "learning_rate": 6.257309941520466e-05, + "loss": 10.9758, + "step": 2150, + "task_loss": 0.40607041120529175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03993427872, + "compression/movement_sparsity/importance_threshold": -6.197733650712984e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9226109417344174, + "compression/movement_sparsity/model_sparsity": 0.8294045806111472, + "compression_loss": 10.667990684509277, + "distillation_loss": 0.7472751140594482, + "epoch": 5.41, + "learning_rate": 6.286549707602338e-05, + "loss": 10.9848, + "step": 2160, + "task_loss": 0.38769587874412537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03994228412, + "compression/movement_sparsity/importance_threshold": -5.442798004793897e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9229368624849443, + "compression/movement_sparsity/model_sparsity": 0.8296975753624288, + "compression_loss": 10.668244361877441, + "distillation_loss": 0.49155575037002563, + "epoch": 5.44, + "learning_rate": 6.315789473684209e-05, + "loss": 11.0034, + "step": 2170, + "task_loss": 0.18195515871047974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039949611520000004, + "compression/movement_sparsity/importance_threshold": -4.751799996960816e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9234349711118639, + "compression/movement_sparsity/model_sparsity": 0.8301453627862724, + "compression_loss": 10.6676607131958, + "distillation_loss": 0.2057265192270279, + "epoch": 5.46, + "learning_rate": 6.345029239766081e-05, + "loss": 10.9875, + "step": 2180, + "task_loss": 0.06234532594680786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039956290920000005, + "compression/movement_sparsity/importance_threshold": -4.121910528183501e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9240599828741343, + "compression/movement_sparsity/model_sparsity": 0.8307072330125114, + "compression_loss": 10.666702270507812, + "distillation_loss": 0.47685402631759644, + "epoch": 5.49, + "learning_rate": 6.374269005847953e-05, + "loss": 10.9471, + "step": 2190, + "task_loss": 0.41978389024734497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03996235232, + "compression/movement_sparsity/importance_threshold": -3.55030049943117e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9245668543925023, + "compression/movement_sparsity/model_sparsity": 0.8311628980605817, + "compression_loss": 10.665879249572754, + "distillation_loss": 0.6600457429885864, + "epoch": 5.51, + "learning_rate": 6.403508771929823e-05, + "loss": 11.0211, + "step": 2200, + "task_loss": 0.3570174276828766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03996782572, + "compression/movement_sparsity/importance_threshold": -3.0341408116741275e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9246335817524842, + "compression/movement_sparsity/model_sparsity": 0.8312228843186215, + "compression_loss": 10.665406227111816, + "distillation_loss": 0.5320706367492676, + "epoch": 5.54, + "learning_rate": 6.432748538011695e-05, + "loss": 10.9993, + "step": 2210, + "task_loss": 0.2885274887084961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03997274112, + "compression/movement_sparsity/importance_threshold": -2.5706023658810494e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.924920463527552, + "compression/movement_sparsity/model_sparsity": 0.8314807839896222, + "compression_loss": 10.664754867553711, + "distillation_loss": 0.17137180268764496, + "epoch": 5.56, + "learning_rate": 6.461988304093567e-05, + "loss": 10.9232, + "step": 2220, + "task_loss": 0.049120813608169556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03997712852, + "compression/movement_sparsity/importance_threshold": -2.156856063022781e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9257162869805782, + "compression/movement_sparsity/model_sparsity": 0.8321962097313297, + "compression_loss": 10.663785934448242, + "distillation_loss": 0.4174395203590393, + "epoch": 5.59, + "learning_rate": 6.491228070175438e-05, + "loss": 11.0043, + "step": 2230, + "task_loss": 0.13315129280090332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03998101792, + "compression/movement_sparsity/importance_threshold": -1.790072804067999e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9262508115966577, + "compression/movement_sparsity/model_sparsity": 0.8326767342351821, + "compression_loss": 10.662391662597656, + "distillation_loss": 0.4411344826221466, + "epoch": 5.61, + "learning_rate": 6.52046783625731e-05, + "loss": 10.9919, + "step": 2240, + "task_loss": 0.1832524538040161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03998443932, + "compression/movement_sparsity/importance_threshold": -1.4674234899864644e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9265289775293586, + "compression/movement_sparsity/model_sparsity": 0.832926798577925, + "compression_loss": 10.661162376403809, + "distillation_loss": 1.028182864189148, + "epoch": 5.64, + "learning_rate": 6.54970760233918e-05, + "loss": 11.0257, + "step": 2250, + "task_loss": 0.5844765901565552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03998742272, + "compression/movement_sparsity/importance_threshold": -1.1860790217484802e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9265480441696778, + "compression/movement_sparsity/model_sparsity": 0.8329439390193637, + "compression_loss": 10.659642219543457, + "distillation_loss": 0.3779850900173187, + "epoch": 5.66, + "learning_rate": 6.578947368421052e-05, + "loss": 11.0075, + "step": 2260, + "task_loss": 0.11443006992340088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03998999812, + "compression/movement_sparsity/importance_threshold": -9.432103003227232e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9268287154659741, + "compression/movement_sparsity/model_sparsity": 0.833196255622456, + "compression_loss": 10.657720565795898, + "distillation_loss": 0.4198092222213745, + "epoch": 5.69, + "learning_rate": 6.608187134502923e-05, + "loss": 10.9443, + "step": 2270, + "task_loss": 0.11462461948394775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03999219552, + "compression/movement_sparsity/importance_threshold": -7.359882266794963e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9270981773185787, + "compression/movement_sparsity/model_sparsity": 0.8334384952109332, + "compression_loss": 10.655531883239746, + "distillation_loss": 0.9664290547370911, + "epoch": 5.71, + "learning_rate": 6.637426900584795e-05, + "loss": 10.9229, + "step": 2280, + "task_loss": 0.5996133685112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03999404492, + "compression/movement_sparsity/importance_threshold": -5.615837017885605e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9276652128500452, + "compression/movement_sparsity/model_sparsity": 0.8339482462293673, + "compression_loss": 10.65322208404541, + "distillation_loss": 0.7060877680778503, + "epoch": 5.74, + "learning_rate": 6.666666666666666e-05, + "loss": 10.9933, + "step": 2290, + "task_loss": 0.48744356632232666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039995576320000004, + "compression/movement_sparsity/importance_threshold": -4.171676266191346e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9279948151912075, + "compression/movement_sparsity/model_sparsity": 0.8342445506402235, + "compression_loss": 10.65136432647705, + "distillation_loss": 0.8349236249923706, + "epoch": 5.76, + "learning_rate": 6.695906432748538e-05, + "loss": 10.975, + "step": 2300, + "task_loss": 0.5106658935546875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03999681972, + "compression/movement_sparsity/importance_threshold": -2.9991090214152166e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9282819674984945, + "compression/movement_sparsity/model_sparsity": 0.8345026935130461, + "compression_loss": 10.649052619934082, + "distillation_loss": 0.4618738889694214, + "epoch": 5.79, + "learning_rate": 6.725146198830408e-05, + "loss": 10.9074, + "step": 2310, + "task_loss": 0.23405304551124573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03999780512, + "compression/movement_sparsity/importance_threshold": -2.069844293243984e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9287675140206263, + "compression/movement_sparsity/model_sparsity": 0.8349391879131656, + "compression_loss": 10.646767616271973, + "distillation_loss": 0.7326123118400574, + "epoch": 5.81, + "learning_rate": 6.75438596491228e-05, + "loss": 10.9723, + "step": 2320, + "task_loss": 0.3694491982460022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039998562519999996, + "compression/movement_sparsity/importance_threshold": -1.3555910913860998e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9291809695874736, + "compression/movement_sparsity/model_sparsity": 0.8353108743147789, + "compression_loss": 10.644407272338867, + "distillation_loss": 0.7847580909729004, + "epoch": 5.84, + "learning_rate": 6.783625730994151e-05, + "loss": 10.9854, + "step": 2330, + "task_loss": 0.6249511241912842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03999912192, + "compression/movement_sparsity/importance_threshold": -8.280584255229099e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9294438563309244, + "compression/movement_sparsity/model_sparsity": 0.8355472030415877, + "compression_loss": 10.642403602600098, + "distillation_loss": 0.7005610466003418, + "epoch": 5.86, + "learning_rate": 6.812865497076023e-05, + "loss": 10.948, + "step": 2340, + "task_loss": 0.4554665982723236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039999513320000006, + "compression/movement_sparsity/importance_threshold": -4.589553053628657e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9299143824337549, + "compression/movement_sparsity/model_sparsity": 0.8359701944536027, + "compression_loss": 10.640273094177246, + "distillation_loss": 0.43438440561294556, + "epoch": 5.89, + "learning_rate": 6.842105263157893e-05, + "loss": 10.9744, + "step": 2350, + "task_loss": 0.22005879878997803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039999766719999996, + "compression/movement_sparsity/importance_threshold": -2.199907406035765e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9300114329268293, + "compression/movement_sparsity/model_sparsity": 0.8360574404636654, + "compression_loss": 10.638136863708496, + "distillation_loss": 0.37377381324768066, + "epoch": 5.91, + "learning_rate": 6.871345029239765e-05, + "loss": 10.9263, + "step": 2360, + "task_loss": 0.3662358522415161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03999991212, + "compression/movement_sparsity/importance_threshold": -8.287374093180963e-10, + "compression/movement_sparsity/linear_layer_sparsity": 0.9300814890093345, + "compression/movement_sparsity/model_sparsity": 0.836120419161512, + "compression_loss": 10.635852813720703, + "distillation_loss": 0.3352665603160858, + "epoch": 5.94, + "learning_rate": 6.900584795321636e-05, + "loss": 10.9481, + "step": 2370, + "task_loss": 0.12698450684547424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03999997952, + "compression/movement_sparsity/importance_threshold": -1.9313316045174375e-10, + "compression/movement_sparsity/linear_layer_sparsity": 0.9305880076595905, + "compression/movement_sparsity/model_sparsity": 0.836575766989815, + "compression_loss": 10.633511543273926, + "distillation_loss": 0.5461183190345764, + "epoch": 5.96, + "learning_rate": 6.929824561403508e-05, + "loss": 10.9881, + "step": 2380, + "task_loss": 0.2637324929237366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03999999892, + "compression/movement_sparsity/importance_threshold": -1.0184756521220556e-11, + "compression/movement_sparsity/linear_layer_sparsity": 0.9312243817750677, + "compression/movement_sparsity/model_sparsity": 0.8371478516925683, + "compression_loss": 10.631221771240234, + "distillation_loss": 0.5138255953788757, + "epoch": 5.99, + "learning_rate": 6.95906432748538e-05, + "loss": 10.9322, + "step": 2390, + "task_loss": 0.2347862720489502 + }, + { + "epoch": 6.0, + "eval_accuracy": 0.9619005589879376, + "eval_loss": 10.780640602111816, + "eval_runtime": 132.1966, + "eval_samples_per_second": 51.423, + "eval_steps_per_second": 1.611, + "step": 2394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.7983419895172119, + "epoch": 6.02, + "learning_rate": 6.985380116959064e-05, + "loss": 4.89, + "step": 2400, + "task_loss": 0.48241502046585083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.9250924587249756, + "epoch": 6.04, + "learning_rate": 6.985380116959064e-05, + "loss": 0.3853, + "step": 2410, + "task_loss": 0.5363513231277466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 1.459159016609192, + "epoch": 6.07, + "learning_rate": 6.956140350877192e-05, + "loss": 0.4553, + "step": 2420, + "task_loss": 1.2184821367263794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.5406507253646851, + "epoch": 6.09, + "learning_rate": 6.92690058479532e-05, + "loss": 0.4053, + "step": 2430, + "task_loss": 0.29688769578933716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.6764876842498779, + "epoch": 6.12, + "learning_rate": 6.89766081871345e-05, + "loss": 0.3763, + "step": 2440, + "task_loss": 0.7804003953933716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.3237318694591522, + "epoch": 6.14, + "learning_rate": 6.868421052631578e-05, + "loss": 0.4146, + "step": 2450, + "task_loss": 0.24066579341888428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.500560462474823, + "epoch": 6.17, + "learning_rate": 6.839181286549707e-05, + "loss": 0.3607, + "step": 2460, + "task_loss": 0.3094533681869507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.5021952390670776, + "epoch": 6.19, + "learning_rate": 6.809941520467835e-05, + "loss": 0.3754, + "step": 2470, + "task_loss": 0.18106400966644287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.9087671041488647, + "epoch": 6.22, + "learning_rate": 6.780701754385964e-05, + "loss": 0.3213, + "step": 2480, + "task_loss": 0.3978729248046875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.3499097228050232, + "epoch": 6.24, + "learning_rate": 6.751461988304093e-05, + "loss": 0.3437, + "step": 2490, + "task_loss": 0.2311665415763855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.37648361921310425, + "epoch": 6.27, + "learning_rate": 6.722222222222222e-05, + "loss": 0.3357, + "step": 2500, + "task_loss": 0.22226202487945557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.7497131824493408, + "epoch": 6.29, + "learning_rate": 6.69298245614035e-05, + "loss": 0.3155, + "step": 2510, + "task_loss": 0.48474282026290894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.2494376003742218, + "epoch": 6.32, + "learning_rate": 6.66374269005848e-05, + "loss": 0.3078, + "step": 2520, + "task_loss": 0.08828747272491455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.671501874923706, + "epoch": 6.34, + "learning_rate": 6.634502923976607e-05, + "loss": 0.3398, + "step": 2530, + "task_loss": 0.5209366679191589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.33928757905960083, + "epoch": 6.37, + "learning_rate": 6.605263157894737e-05, + "loss": 0.2388, + "step": 2540, + "task_loss": 0.07023358345031738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.7021492719650269, + "epoch": 6.39, + "learning_rate": 6.576023391812865e-05, + "loss": 0.261, + "step": 2550, + "task_loss": 0.38910675048828125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.46911415457725525, + "epoch": 6.42, + "learning_rate": 6.546783625730994e-05, + "loss": 0.2949, + "step": 2560, + "task_loss": 0.2855373024940491 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.4447360336780548, + "epoch": 6.44, + "learning_rate": 6.517543859649122e-05, + "loss": 0.3047, + "step": 2570, + "task_loss": 0.1770896315574646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.4246532618999481, + "epoch": 6.47, + "learning_rate": 6.488304093567252e-05, + "loss": 0.2574, + "step": 2580, + "task_loss": 0.13081282377243042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.2699139416217804, + "epoch": 6.49, + "learning_rate": 6.45906432748538e-05, + "loss": 0.2827, + "step": 2590, + "task_loss": 0.11969354748725891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.1465551257133484, + "epoch": 6.52, + "learning_rate": 6.429824561403508e-05, + "loss": 0.255, + "step": 2600, + "task_loss": 0.04637971520423889 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.38055044412612915, + "epoch": 6.54, + "learning_rate": 6.400584795321637e-05, + "loss": 0.2831, + "step": 2610, + "task_loss": 0.25821614265441895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.6416689157485962, + "epoch": 6.57, + "learning_rate": 6.371345029239765e-05, + "loss": 0.2951, + "step": 2620, + "task_loss": 0.21043294668197632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.9269071221351624, + "epoch": 6.59, + "learning_rate": 6.342105263157895e-05, + "loss": 0.3323, + "step": 2630, + "task_loss": 0.5026609301567078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.6682649850845337, + "epoch": 6.62, + "learning_rate": 6.312865497076023e-05, + "loss": 0.301, + "step": 2640, + "task_loss": 0.4854172170162201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.15332600474357605, + "epoch": 6.64, + "learning_rate": 6.283625730994151e-05, + "loss": 0.2931, + "step": 2650, + "task_loss": 0.07616248726844788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.6017502546310425, + "epoch": 6.67, + "learning_rate": 6.25438596491228e-05, + "loss": 0.3101, + "step": 2660, + "task_loss": 0.24977391958236694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.49376827478408813, + "epoch": 6.69, + "learning_rate": 6.225146198830408e-05, + "loss": 0.2862, + "step": 2670, + "task_loss": 0.2650887072086334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.5746809244155884, + "epoch": 6.72, + "learning_rate": 6.195906432748538e-05, + "loss": 0.2362, + "step": 2680, + "task_loss": 0.3955618739128113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.1680450141429901, + "epoch": 6.74, + "learning_rate": 6.166666666666666e-05, + "loss": 0.2752, + "step": 2690, + "task_loss": 0.05644118785858154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.267273485660553, + "epoch": 6.77, + "learning_rate": 6.137426900584795e-05, + "loss": 0.2803, + "step": 2700, + "task_loss": 0.13188397884368896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.40438371896743774, + "epoch": 6.79, + "learning_rate": 6.108187134502923e-05, + "loss": 0.3032, + "step": 2710, + "task_loss": 0.1812323033809662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.35020095109939575, + "epoch": 6.82, + "learning_rate": 6.0789473684210525e-05, + "loss": 0.2925, + "step": 2720, + "task_loss": 0.12603074312210083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.17570018768310547, + "epoch": 6.84, + "learning_rate": 6.0497076023391806e-05, + "loss": 0.2099, + "step": 2730, + "task_loss": 0.021570265293121338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.9918994903564453, + "epoch": 6.87, + "learning_rate": 6.02046783625731e-05, + "loss": 0.2642, + "step": 2740, + "task_loss": 0.6132215261459351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.351323664188385, + "epoch": 6.89, + "learning_rate": 5.991228070175438e-05, + "loss": 0.2622, + "step": 2750, + "task_loss": 0.13521471619606018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.2853711247444153, + "epoch": 6.92, + "learning_rate": 5.961988304093567e-05, + "loss": 0.3161, + "step": 2760, + "task_loss": 0.17914897203445435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.4238589107990265, + "epoch": 6.94, + "learning_rate": 5.9327485380116955e-05, + "loss": 0.2448, + "step": 2770, + "task_loss": 0.25172996520996094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.557287335395813, + "epoch": 6.97, + "learning_rate": 5.903508771929824e-05, + "loss": 0.294, + "step": 2780, + "task_loss": 0.25702038407325745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.5326415300369263, + "epoch": 6.99, + "learning_rate": 5.874269005847952e-05, + "loss": 0.2389, + "step": 2790, + "task_loss": 0.37412598729133606 + }, + { + "epoch": 7.0, + "eval_accuracy": 0.9738158281847602, + "eval_loss": 0.114750936627388, + "eval_runtime": 88.7864, + "eval_samples_per_second": 76.566, + "eval_steps_per_second": 2.399, + "step": 2793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.4145723879337311, + "epoch": 7.02, + "learning_rate": 5.8450292397660816e-05, + "loss": 0.292, + "step": 2800, + "task_loss": 0.2053484320640564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.5730660557746887, + "epoch": 7.04, + "learning_rate": 5.81578947368421e-05, + "loss": 0.2816, + "step": 2810, + "task_loss": 0.3073492646217346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.22465837001800537, + "epoch": 7.07, + "learning_rate": 5.786549707602339e-05, + "loss": 0.2321, + "step": 2820, + "task_loss": 0.083594411611557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.38710612058639526, + "epoch": 7.09, + "learning_rate": 5.757309941520467e-05, + "loss": 0.2502, + "step": 2830, + "task_loss": 0.22288838028907776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.4200671911239624, + "epoch": 7.12, + "learning_rate": 5.7280701754385965e-05, + "loss": 0.2209, + "step": 2840, + "task_loss": 0.18464124202728271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.5526059865951538, + "epoch": 7.14, + "learning_rate": 5.6988304093567246e-05, + "loss": 0.2381, + "step": 2850, + "task_loss": 0.27178797125816345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.8992462158203125, + "epoch": 7.17, + "learning_rate": 5.669590643274853e-05, + "loss": 0.2418, + "step": 2860, + "task_loss": 0.7048245668411255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.4709673821926117, + "epoch": 7.19, + "learning_rate": 5.640350877192982e-05, + "loss": 0.2539, + "step": 2870, + "task_loss": 0.2595267593860626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.4535144865512848, + "epoch": 7.22, + "learning_rate": 5.611111111111111e-05, + "loss": 0.2421, + "step": 2880, + "task_loss": 0.19773799180984497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.5013654232025146, + "epoch": 7.24, + "learning_rate": 5.5818713450292395e-05, + "loss": 0.249, + "step": 2890, + "task_loss": 0.19651299715042114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 1.160736322402954, + "epoch": 7.27, + "learning_rate": 5.552631578947368e-05, + "loss": 0.2468, + "step": 2900, + "task_loss": 0.7461255192756653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.2139417827129364, + "epoch": 7.29, + "learning_rate": 5.523391812865496e-05, + "loss": 0.2224, + "step": 2910, + "task_loss": 0.07157644629478455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.6396937966346741, + "epoch": 7.32, + "learning_rate": 5.494152046783626e-05, + "loss": 0.2347, + "step": 2920, + "task_loss": 0.519213080406189 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.27051806449890137, + "epoch": 7.34, + "learning_rate": 5.464912280701754e-05, + "loss": 0.2581, + "step": 2930, + "task_loss": 0.06110638380050659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.2813902497291565, + "epoch": 7.37, + "learning_rate": 5.435672514619883e-05, + "loss": 0.2618, + "step": 2940, + "task_loss": 0.2888513207435608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.6568804979324341, + "epoch": 7.39, + "learning_rate": 5.406432748538011e-05, + "loss": 0.2664, + "step": 2950, + "task_loss": 0.4726387858390808 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.3869742453098297, + "epoch": 7.42, + "learning_rate": 5.37719298245614e-05, + "loss": 0.2588, + "step": 2960, + "task_loss": 0.3070613741874695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.3587968349456787, + "epoch": 7.44, + "learning_rate": 5.3479532163742686e-05, + "loss": 0.2533, + "step": 2970, + "task_loss": 0.12952539324760437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.20623968541622162, + "epoch": 7.47, + "learning_rate": 5.3187134502923973e-05, + "loss": 0.2429, + "step": 2980, + "task_loss": 0.050109267234802246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.3525960445404053, + "epoch": 7.49, + "learning_rate": 5.289473684210526e-05, + "loss": 0.2101, + "step": 2990, + "task_loss": 0.15648704767227173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.5383965373039246, + "epoch": 7.52, + "learning_rate": 5.260233918128655e-05, + "loss": 0.2462, + "step": 3000, + "task_loss": 0.2703443765640259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.24444815516471863, + "epoch": 7.54, + "learning_rate": 5.230994152046783e-05, + "loss": 0.2791, + "step": 3010, + "task_loss": 0.074734628200531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.8547993898391724, + "epoch": 7.57, + "learning_rate": 5.201754385964912e-05, + "loss": 0.2289, + "step": 3020, + "task_loss": 0.37249431014060974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.41006985306739807, + "epoch": 7.59, + "learning_rate": 5.17251461988304e-05, + "loss": 0.2392, + "step": 3030, + "task_loss": 0.18442684412002563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.15326108038425446, + "epoch": 7.62, + "learning_rate": 5.14327485380117e-05, + "loss": 0.2821, + "step": 3040, + "task_loss": 0.2564823031425476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.5139175653457642, + "epoch": 7.64, + "learning_rate": 5.114035087719298e-05, + "loss": 0.2611, + "step": 3050, + "task_loss": 0.3527417778968811 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.36897873878479004, + "epoch": 7.67, + "learning_rate": 5.0847953216374265e-05, + "loss": 0.2883, + "step": 3060, + "task_loss": 0.4354107975959778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.24441149830818176, + "epoch": 7.69, + "learning_rate": 5.055555555555555e-05, + "loss": 0.2414, + "step": 3070, + "task_loss": 0.05061835050582886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.4979506731033325, + "epoch": 7.72, + "learning_rate": 5.026315789473684e-05, + "loss": 0.2641, + "step": 3080, + "task_loss": 0.28660231828689575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.23474541306495667, + "epoch": 7.74, + "learning_rate": 4.9970760233918126e-05, + "loss": 0.263, + "step": 3090, + "task_loss": 0.08172953128814697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.3658486604690552, + "epoch": 7.77, + "learning_rate": 4.9678362573099414e-05, + "loss": 0.2276, + "step": 3100, + "task_loss": 0.06395676732063293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.6114939451217651, + "epoch": 7.79, + "learning_rate": 4.9385964912280694e-05, + "loss": 0.2165, + "step": 3110, + "task_loss": 0.21138450503349304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.42987892031669617, + "epoch": 7.82, + "learning_rate": 4.909356725146199e-05, + "loss": 0.2587, + "step": 3120, + "task_loss": 0.20042163133621216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.34050965309143066, + "epoch": 7.84, + "learning_rate": 4.880116959064327e-05, + "loss": 0.2114, + "step": 3130, + "task_loss": 0.15165340900421143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.2943706512451172, + "epoch": 7.87, + "learning_rate": 4.850877192982456e-05, + "loss": 0.217, + "step": 3140, + "task_loss": 0.12199932336807251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.5021252632141113, + "epoch": 7.89, + "learning_rate": 4.821637426900584e-05, + "loss": 0.2315, + "step": 3150, + "task_loss": 0.12248951196670532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.23276600241661072, + "epoch": 7.92, + "learning_rate": 4.792397660818713e-05, + "loss": 0.195, + "step": 3160, + "task_loss": 0.04651379585266113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.2711644470691681, + "epoch": 7.94, + "learning_rate": 4.763157894736842e-05, + "loss": 0.2542, + "step": 3170, + "task_loss": 0.2954481244087219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.6499089598655701, + "epoch": 7.97, + "learning_rate": 4.7339181286549705e-05, + "loss": 0.2303, + "step": 3180, + "task_loss": 0.2569674849510193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.26821741461753845, + "epoch": 7.99, + "learning_rate": 4.704678362573099e-05, + "loss": 0.2522, + "step": 3190, + "task_loss": 0.07824259996414185 + }, + { + "epoch": 8.0, + "eval_accuracy": 0.9746984407178582, + "eval_loss": 0.10132193565368652, + "eval_runtime": 90.0245, + "eval_samples_per_second": 75.513, + "eval_steps_per_second": 2.366, + "step": 3192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.5839666128158569, + "epoch": 8.02, + "learning_rate": 4.675438596491228e-05, + "loss": 0.2727, + "step": 3200, + "task_loss": 0.2630302906036377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.6127955317497253, + "epoch": 8.05, + "learning_rate": 4.646198830409356e-05, + "loss": 0.2301, + "step": 3210, + "task_loss": 0.3263406455516815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.16742965579032898, + "epoch": 8.07, + "learning_rate": 4.6169590643274854e-05, + "loss": 0.2284, + "step": 3220, + "task_loss": 0.033839792013168335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.34027671813964844, + "epoch": 8.1, + "learning_rate": 4.5877192982456134e-05, + "loss": 0.2362, + "step": 3230, + "task_loss": 0.0935179591178894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.44616079330444336, + "epoch": 8.12, + "learning_rate": 4.558479532163743e-05, + "loss": 0.2615, + "step": 3240, + "task_loss": 0.1582067608833313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.3368973135948181, + "epoch": 8.15, + "learning_rate": 4.529239766081871e-05, + "loss": 0.2173, + "step": 3250, + "task_loss": 0.11782985925674438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.325298547744751, + "epoch": 8.17, + "learning_rate": 4.4999999999999996e-05, + "loss": 0.2466, + "step": 3260, + "task_loss": 0.2142500877380371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.5337823629379272, + "epoch": 8.2, + "learning_rate": 4.470760233918128e-05, + "loss": 0.2397, + "step": 3270, + "task_loss": 0.41766586899757385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.8093789219856262, + "epoch": 8.22, + "learning_rate": 4.441520467836257e-05, + "loss": 0.2337, + "step": 3280, + "task_loss": 0.43245404958724976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.3988366723060608, + "epoch": 8.25, + "learning_rate": 4.412280701754386e-05, + "loss": 0.2029, + "step": 3290, + "task_loss": 0.3944489657878876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.193750262260437, + "epoch": 8.27, + "learning_rate": 4.3830409356725145e-05, + "loss": 0.2072, + "step": 3300, + "task_loss": 0.13878530263900757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.4989054203033447, + "epoch": 8.3, + "learning_rate": 4.3538011695906426e-05, + "loss": 0.2374, + "step": 3310, + "task_loss": 0.35233354568481445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.47747933864593506, + "epoch": 8.32, + "learning_rate": 4.324561403508772e-05, + "loss": 0.194, + "step": 3320, + "task_loss": 0.23924636840820312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.5551918745040894, + "epoch": 8.35, + "learning_rate": 4.2953216374269e-05, + "loss": 0.177, + "step": 3330, + "task_loss": 0.3308478593826294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.3261679410934448, + "epoch": 8.37, + "learning_rate": 4.2660818713450294e-05, + "loss": 0.227, + "step": 3340, + "task_loss": 0.18861651420593262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.13686245679855347, + "epoch": 8.4, + "learning_rate": 4.2368421052631575e-05, + "loss": 0.2417, + "step": 3350, + "task_loss": 0.10923665761947632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.2956581115722656, + "epoch": 8.42, + "learning_rate": 4.207602339181287e-05, + "loss": 0.2169, + "step": 3360, + "task_loss": 0.08555316925048828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.419700562953949, + "epoch": 8.45, + "learning_rate": 4.178362573099415e-05, + "loss": 0.1994, + "step": 3370, + "task_loss": 0.14956381916999817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.21010534465312958, + "epoch": 8.47, + "learning_rate": 4.1491228070175436e-05, + "loss": 0.2124, + "step": 3380, + "task_loss": 0.10485100746154785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.26641708612442017, + "epoch": 8.5, + "learning_rate": 4.1198830409356724e-05, + "loss": 0.2463, + "step": 3390, + "task_loss": 0.08828288316726685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.4014192819595337, + "epoch": 8.52, + "learning_rate": 4.090643274853801e-05, + "loss": 0.2073, + "step": 3400, + "task_loss": 0.16874057054519653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.4832579791545868, + "epoch": 8.55, + "learning_rate": 4.061403508771929e-05, + "loss": 0.2369, + "step": 3410, + "task_loss": 0.3887079060077667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.30582836270332336, + "epoch": 8.57, + "learning_rate": 4.0321637426900585e-05, + "loss": 0.2327, + "step": 3420, + "task_loss": 0.2814703583717346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.6271736025810242, + "epoch": 8.6, + "learning_rate": 4.0029239766081866e-05, + "loss": 0.2156, + "step": 3430, + "task_loss": 0.34107378125190735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.37824779748916626, + "epoch": 8.62, + "learning_rate": 3.973684210526316e-05, + "loss": 0.2529, + "step": 3440, + "task_loss": 0.20320969820022583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.34715747833251953, + "epoch": 8.65, + "learning_rate": 3.944444444444444e-05, + "loss": 0.226, + "step": 3450, + "task_loss": 0.23775744438171387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.48077207803726196, + "epoch": 8.67, + "learning_rate": 3.9152046783625734e-05, + "loss": 0.2313, + "step": 3460, + "task_loss": 0.23516058921813965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.48644116520881653, + "epoch": 8.7, + "learning_rate": 3.8859649122807015e-05, + "loss": 0.1951, + "step": 3470, + "task_loss": 0.27591028809547424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.30401456356048584, + "epoch": 8.72, + "learning_rate": 3.85672514619883e-05, + "loss": 0.2079, + "step": 3480, + "task_loss": 0.19322288036346436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.08988785743713379, + "epoch": 8.75, + "learning_rate": 3.827485380116959e-05, + "loss": 0.2213, + "step": 3490, + "task_loss": 0.25623592734336853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.15177589654922485, + "epoch": 8.77, + "learning_rate": 3.7982456140350876e-05, + "loss": 0.2061, + "step": 3500, + "task_loss": 0.15864485502243042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.34355148673057556, + "epoch": 8.8, + "learning_rate": 3.769005847953216e-05, + "loss": 0.2517, + "step": 3510, + "task_loss": 0.140297532081604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.7358225584030151, + "epoch": 8.82, + "learning_rate": 3.739766081871345e-05, + "loss": 0.2746, + "step": 3520, + "task_loss": 0.26640784740448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.4418618083000183, + "epoch": 8.85, + "learning_rate": 3.710526315789473e-05, + "loss": 0.2479, + "step": 3530, + "task_loss": 0.22608336806297302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.5399448275566101, + "epoch": 8.87, + "learning_rate": 3.6812865497076025e-05, + "loss": 0.1936, + "step": 3540, + "task_loss": 0.28722792863845825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.09100654721260071, + "epoch": 8.9, + "learning_rate": 3.6520467836257306e-05, + "loss": 0.2135, + "step": 3550, + "task_loss": 0.15959030389785767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.11753946542739868, + "epoch": 8.92, + "learning_rate": 3.62280701754386e-05, + "loss": 0.1941, + "step": 3560, + "task_loss": 0.036947041749954224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.4342065751552582, + "epoch": 8.95, + "learning_rate": 3.593567251461988e-05, + "loss": 0.2682, + "step": 3570, + "task_loss": 0.2372136116027832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.38562774658203125, + "epoch": 8.97, + "learning_rate": 3.564327485380117e-05, + "loss": 0.2453, + "step": 3580, + "task_loss": 0.3832007050514221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.2638232707977295, + "epoch": 9.0, + "learning_rate": 3.5350877192982455e-05, + "loss": 0.2213, + "step": 3590, + "task_loss": 0.0510326623916626 + }, + { + "epoch": 9.0, + "eval_accuracy": 0.9754339511621065, + "eval_loss": 0.09834744036197662, + "eval_runtime": 88.3371, + "eval_samples_per_second": 76.955, + "eval_steps_per_second": 2.411, + "step": 3591 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.6044875383377075, + "epoch": 9.02, + "learning_rate": 3.505847953216374e-05, + "loss": 0.2061, + "step": 3600, + "task_loss": 0.29902324080467224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.48518362641334534, + "epoch": 9.05, + "learning_rate": 3.476608187134503e-05, + "loss": 0.1969, + "step": 3610, + "task_loss": 0.1398446261882782 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.5704216957092285, + "epoch": 9.07, + "learning_rate": 3.447368421052631e-05, + "loss": 0.1819, + "step": 3620, + "task_loss": 0.2688668966293335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.2637028098106384, + "epoch": 9.1, + "learning_rate": 3.41812865497076e-05, + "loss": 0.23, + "step": 3630, + "task_loss": 0.28141531348228455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.34086185693740845, + "epoch": 9.12, + "learning_rate": 3.3888888888888884e-05, + "loss": 0.2153, + "step": 3640, + "task_loss": 0.3492887616157532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.22228632867336273, + "epoch": 9.15, + "learning_rate": 3.359649122807017e-05, + "loss": 0.2178, + "step": 3650, + "task_loss": 0.24493113160133362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.31930631399154663, + "epoch": 9.17, + "learning_rate": 3.330409356725146e-05, + "loss": 0.2436, + "step": 3660, + "task_loss": 0.21123933792114258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.2261628806591034, + "epoch": 9.2, + "learning_rate": 3.3011695906432746e-05, + "loss": 0.2135, + "step": 3670, + "task_loss": 0.05479854345321655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.20525804162025452, + "epoch": 9.22, + "learning_rate": 3.2719298245614033e-05, + "loss": 0.1785, + "step": 3680, + "task_loss": 0.15875384211540222 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.17571699619293213, + "epoch": 9.25, + "learning_rate": 3.242690058479532e-05, + "loss": 0.2115, + "step": 3690, + "task_loss": 0.05111098289489746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.07689700275659561, + "epoch": 9.27, + "learning_rate": 3.213450292397661e-05, + "loss": 0.2062, + "step": 3700, + "task_loss": 0.21406108140945435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.05007719248533249, + "epoch": 9.3, + "learning_rate": 3.1842105263157895e-05, + "loss": 0.2535, + "step": 3710, + "task_loss": 0.1095457673072815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.3887738585472107, + "epoch": 9.32, + "learning_rate": 3.1549707602339176e-05, + "loss": 0.2004, + "step": 3720, + "task_loss": 0.20141488313674927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.2081771194934845, + "epoch": 9.35, + "learning_rate": 3.125730994152046e-05, + "loss": 0.1698, + "step": 3730, + "task_loss": 0.06815099716186523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.13659320771694183, + "epoch": 9.37, + "learning_rate": 3.096491228070175e-05, + "loss": 0.1826, + "step": 3740, + "task_loss": 0.049779534339904785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.46494409441947937, + "epoch": 9.4, + "learning_rate": 3.067251461988304e-05, + "loss": 0.2115, + "step": 3750, + "task_loss": 0.26856356859207153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.11897766590118408, + "epoch": 9.42, + "learning_rate": 3.0380116959064325e-05, + "loss": 0.1778, + "step": 3760, + "task_loss": 0.02882954478263855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.31966108083724976, + "epoch": 9.45, + "learning_rate": 3.0087719298245612e-05, + "loss": 0.2114, + "step": 3770, + "task_loss": 0.30511146783828735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.38032066822052, + "epoch": 9.47, + "learning_rate": 2.97953216374269e-05, + "loss": 0.197, + "step": 3780, + "task_loss": 0.19982492923736572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.2971644699573517, + "epoch": 9.5, + "learning_rate": 2.9502923976608186e-05, + "loss": 0.1968, + "step": 3790, + "task_loss": 0.1379544734954834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.30849239230155945, + "epoch": 9.52, + "learning_rate": 2.921052631578947e-05, + "loss": 0.17, + "step": 3800, + "task_loss": 0.27317845821380615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.1406727433204651, + "epoch": 9.55, + "learning_rate": 2.8918128654970757e-05, + "loss": 0.1798, + "step": 3810, + "task_loss": 0.06477174162864685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.39964333176612854, + "epoch": 9.57, + "learning_rate": 2.8625730994152045e-05, + "loss": 0.1924, + "step": 3820, + "task_loss": 0.2196149230003357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.24219584465026855, + "epoch": 9.6, + "learning_rate": 2.8333333333333332e-05, + "loss": 0.1992, + "step": 3830, + "task_loss": 0.17496830224990845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.662438690662384, + "epoch": 9.62, + "learning_rate": 2.804093567251462e-05, + "loss": 0.176, + "step": 3840, + "task_loss": 0.4713020324707031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.5137396454811096, + "epoch": 9.65, + "learning_rate": 2.7748538011695903e-05, + "loss": 0.1907, + "step": 3850, + "task_loss": 0.2300821840763092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.23515579104423523, + "epoch": 9.67, + "learning_rate": 2.745614035087719e-05, + "loss": 0.1941, + "step": 3860, + "task_loss": 0.13494980335235596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.33846205472946167, + "epoch": 9.7, + "learning_rate": 2.7163742690058478e-05, + "loss": 0.1983, + "step": 3870, + "task_loss": 0.23276156187057495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.2716241478919983, + "epoch": 9.72, + "learning_rate": 2.6871345029239765e-05, + "loss": 0.1765, + "step": 3880, + "task_loss": 0.3164524734020233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.38798290491104126, + "epoch": 9.75, + "learning_rate": 2.6578947368421052e-05, + "loss": 0.2094, + "step": 3890, + "task_loss": 0.1473989486694336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.15159770846366882, + "epoch": 9.77, + "learning_rate": 2.6286549707602336e-05, + "loss": 0.2493, + "step": 3900, + "task_loss": 0.05544543266296387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.34458696842193604, + "epoch": 9.8, + "learning_rate": 2.5994152046783623e-05, + "loss": 0.1892, + "step": 3910, + "task_loss": 0.24102401733398438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.20027554035186768, + "epoch": 9.82, + "learning_rate": 2.570175438596491e-05, + "loss": 0.1815, + "step": 3920, + "task_loss": 0.18737459182739258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.25017499923706055, + "epoch": 9.85, + "learning_rate": 2.5409356725146198e-05, + "loss": 0.1988, + "step": 3930, + "task_loss": 0.13370424509048462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.26910173892974854, + "epoch": 9.87, + "learning_rate": 2.5116959064327485e-05, + "loss": 0.1978, + "step": 3940, + "task_loss": 0.10323816537857056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.7278082370758057, + "epoch": 9.9, + "learning_rate": 2.482456140350877e-05, + "loss": 0.1861, + "step": 3950, + "task_loss": 0.40188902616500854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.4854397177696228, + "epoch": 9.92, + "learning_rate": 2.4532163742690056e-05, + "loss": 0.217, + "step": 3960, + "task_loss": 0.2801436185836792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.359090119600296, + "epoch": 9.95, + "learning_rate": 2.4239766081871343e-05, + "loss": 0.1845, + "step": 3970, + "task_loss": 0.23310816287994385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.3712664842605591, + "epoch": 9.97, + "learning_rate": 2.394736842105263e-05, + "loss": 0.2111, + "step": 3980, + "task_loss": 0.3080166280269623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.3844277858734131, + "epoch": 10.0, + "learning_rate": 2.3654970760233918e-05, + "loss": 0.2053, + "step": 3990, + "task_loss": 0.18040329217910767 + }, + { + "epoch": 10.0, + "eval_accuracy": 0.9767578699617535, + "eval_loss": 0.09344575554132462, + "eval_runtime": 109.4349, + "eval_samples_per_second": 62.119, + "eval_steps_per_second": 1.946, + "step": 3990 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.3194921910762787, + "epoch": 10.03, + "learning_rate": 2.33625730994152e-05, + "loss": 0.2147, + "step": 4000, + "task_loss": 0.08249035477638245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.7327070236206055, + "epoch": 10.05, + "learning_rate": 2.307017543859649e-05, + "loss": 0.2009, + "step": 4010, + "task_loss": 0.40034812688827515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.2106877863407135, + "epoch": 10.08, + "learning_rate": 2.2777777777777776e-05, + "loss": 0.2072, + "step": 4020, + "task_loss": 0.25867098569869995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.27913522720336914, + "epoch": 10.1, + "learning_rate": 2.2485380116959063e-05, + "loss": 0.2391, + "step": 4030, + "task_loss": 0.2785712480545044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.14740639925003052, + "epoch": 10.13, + "learning_rate": 2.219298245614035e-05, + "loss": 0.1665, + "step": 4040, + "task_loss": 0.03724539279937744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.03694353997707367, + "epoch": 10.15, + "learning_rate": 2.1900584795321638e-05, + "loss": 0.1724, + "step": 4050, + "task_loss": 0.03579223155975342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.48803913593292236, + "epoch": 10.18, + "learning_rate": 2.1608187134502922e-05, + "loss": 0.1803, + "step": 4060, + "task_loss": 0.305889368057251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.29847007989883423, + "epoch": 10.2, + "learning_rate": 2.131578947368421e-05, + "loss": 0.1663, + "step": 4070, + "task_loss": 0.15580615401268005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.3054282069206238, + "epoch": 10.23, + "learning_rate": 2.1023391812865496e-05, + "loss": 0.1386, + "step": 4080, + "task_loss": 0.20085352659225464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.3656499683856964, + "epoch": 10.25, + "learning_rate": 2.0730994152046784e-05, + "loss": 0.2038, + "step": 4090, + "task_loss": 0.06400293111801147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.27491533756256104, + "epoch": 10.28, + "learning_rate": 2.043859649122807e-05, + "loss": 0.2023, + "step": 4100, + "task_loss": 0.0929376482963562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.1880778968334198, + "epoch": 10.3, + "learning_rate": 2.0146198830409355e-05, + "loss": 0.1673, + "step": 4110, + "task_loss": 0.06569665670394897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.36783143877983093, + "epoch": 10.33, + "learning_rate": 1.9853801169590642e-05, + "loss": 0.1728, + "step": 4120, + "task_loss": 0.22449851036071777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.4092685282230377, + "epoch": 10.35, + "learning_rate": 1.956140350877193e-05, + "loss": 0.1937, + "step": 4130, + "task_loss": 0.14843645691871643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.1534067690372467, + "epoch": 10.38, + "learning_rate": 1.9269005847953216e-05, + "loss": 0.1773, + "step": 4140, + "task_loss": 0.0629468560218811 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.3426051735877991, + "epoch": 10.4, + "learning_rate": 1.8976608187134504e-05, + "loss": 0.1967, + "step": 4150, + "task_loss": 0.18895089626312256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.2175091803073883, + "epoch": 10.43, + "learning_rate": 1.8684210526315787e-05, + "loss": 0.1879, + "step": 4160, + "task_loss": 0.04404401779174805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.2926103472709656, + "epoch": 10.45, + "learning_rate": 1.8391812865497075e-05, + "loss": 0.1696, + "step": 4170, + "task_loss": 0.06125068664550781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.11740519106388092, + "epoch": 10.48, + "learning_rate": 1.8099415204678362e-05, + "loss": 0.1897, + "step": 4180, + "task_loss": 0.18562465906143188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.06711611151695251, + "epoch": 10.5, + "learning_rate": 1.780701754385965e-05, + "loss": 0.14, + "step": 4190, + "task_loss": 0.010339558124542236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.31951457262039185, + "epoch": 10.53, + "learning_rate": 1.7514619883040936e-05, + "loss": 0.1742, + "step": 4200, + "task_loss": 0.17789161205291748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.24925366044044495, + "epoch": 10.55, + "learning_rate": 1.722222222222222e-05, + "loss": 0.1216, + "step": 4210, + "task_loss": 0.11909815669059753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.32841724157333374, + "epoch": 10.58, + "learning_rate": 1.6929824561403508e-05, + "loss": 0.1491, + "step": 4220, + "task_loss": 0.07813969254493713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.21618331968784332, + "epoch": 10.6, + "learning_rate": 1.663742690058479e-05, + "loss": 0.1802, + "step": 4230, + "task_loss": 0.04656791687011719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.5704886317253113, + "epoch": 10.63, + "learning_rate": 1.634502923976608e-05, + "loss": 0.1595, + "step": 4240, + "task_loss": 0.33919835090637207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.03302011638879776, + "epoch": 10.65, + "learning_rate": 1.6052631578947366e-05, + "loss": 0.1652, + "step": 4250, + "task_loss": 0.03273957967758179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.2786880135536194, + "epoch": 10.68, + "learning_rate": 1.5760233918128653e-05, + "loss": 0.1777, + "step": 4260, + "task_loss": 0.08011233806610107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.28394752740859985, + "epoch": 10.7, + "learning_rate": 1.546783625730994e-05, + "loss": 0.1657, + "step": 4270, + "task_loss": 0.11078321933746338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.19141581654548645, + "epoch": 10.73, + "learning_rate": 1.5175438596491226e-05, + "loss": 0.1342, + "step": 4280, + "task_loss": 0.11384612321853638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.19070206582546234, + "epoch": 10.75, + "learning_rate": 1.4883040935672513e-05, + "loss": 0.1618, + "step": 4290, + "task_loss": 0.03678613901138306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.3137202858924866, + "epoch": 10.78, + "learning_rate": 1.4590643274853799e-05, + "loss": 0.1616, + "step": 4300, + "task_loss": 0.40499967336654663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.31758958101272583, + "epoch": 10.8, + "learning_rate": 1.4298245614035086e-05, + "loss": 0.1941, + "step": 4310, + "task_loss": 0.08149957656860352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.08811835944652557, + "epoch": 10.83, + "learning_rate": 1.4005847953216372e-05, + "loss": 0.2147, + "step": 4320, + "task_loss": 0.07098215818405151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.1679392009973526, + "epoch": 10.85, + "learning_rate": 1.3713450292397659e-05, + "loss": 0.164, + "step": 4330, + "task_loss": 0.15692710876464844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.3241733908653259, + "epoch": 10.88, + "learning_rate": 1.3421052631578946e-05, + "loss": 0.1727, + "step": 4340, + "task_loss": 0.18366020917892456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.5590642690658569, + "epoch": 10.9, + "learning_rate": 1.3128654970760232e-05, + "loss": 0.1559, + "step": 4350, + "task_loss": 0.4202096462249756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.09962806850671768, + "epoch": 10.93, + "learning_rate": 1.2836257309941519e-05, + "loss": 0.1921, + "step": 4360, + "task_loss": 0.055821776390075684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.12555056810379028, + "epoch": 10.95, + "learning_rate": 1.2543859649122804e-05, + "loss": 0.1507, + "step": 4370, + "task_loss": 0.17544305324554443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.11806661635637283, + "epoch": 10.98, + "learning_rate": 1.2251461988304092e-05, + "loss": 0.1543, + "step": 4380, + "task_loss": 0.05767279863357544 + }, + { + "epoch": 11.0, + "eval_accuracy": 0.9779346866725508, + "eval_loss": 0.08748478442430496, + "eval_runtime": 88.9404, + "eval_samples_per_second": 76.433, + "eval_steps_per_second": 2.395, + "step": 4389 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.19886839389801025, + "epoch": 11.0, + "learning_rate": 1.1959064327485379e-05, + "loss": 0.1981, + "step": 4390, + "task_loss": 0.26288312673568726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.11590077728033066, + "epoch": 11.03, + "learning_rate": 1.1666666666666665e-05, + "loss": 0.1666, + "step": 4400, + "task_loss": 0.06784418225288391 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.21343311667442322, + "epoch": 11.05, + "learning_rate": 1.1374269005847952e-05, + "loss": 0.1697, + "step": 4410, + "task_loss": 0.0758419930934906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.30859267711639404, + "epoch": 11.08, + "learning_rate": 1.1081871345029239e-05, + "loss": 0.1896, + "step": 4420, + "task_loss": 0.07110640406608582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.2189975082874298, + "epoch": 11.1, + "learning_rate": 1.0789473684210525e-05, + "loss": 0.1926, + "step": 4430, + "task_loss": 0.05106937885284424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.39309024810791016, + "epoch": 11.13, + "learning_rate": 1.0497076023391812e-05, + "loss": 0.1556, + "step": 4440, + "task_loss": 0.23538663983345032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.5418274998664856, + "epoch": 11.15, + "learning_rate": 1.0204678362573097e-05, + "loss": 0.1725, + "step": 4450, + "task_loss": 0.2049427330493927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.5003339648246765, + "epoch": 11.18, + "learning_rate": 9.912280701754385e-06, + "loss": 0.1857, + "step": 4460, + "task_loss": 0.20348548889160156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.20201507210731506, + "epoch": 11.2, + "learning_rate": 9.619883040935672e-06, + "loss": 0.1821, + "step": 4470, + "task_loss": 0.06827902793884277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.4064372181892395, + "epoch": 11.23, + "learning_rate": 9.327485380116957e-06, + "loss": 0.1734, + "step": 4480, + "task_loss": 0.2290695309638977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.25272685289382935, + "epoch": 11.25, + "learning_rate": 9.035087719298245e-06, + "loss": 0.1609, + "step": 4490, + "task_loss": 0.09427431225776672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.35564708709716797, + "epoch": 11.28, + "learning_rate": 8.74269005847953e-06, + "loss": 0.1769, + "step": 4500, + "task_loss": 0.15388613939285278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.13362084329128265, + "epoch": 11.3, + "learning_rate": 8.450292397660817e-06, + "loss": 0.1779, + "step": 4510, + "task_loss": 0.07664147019386292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.05402388423681259, + "epoch": 11.33, + "learning_rate": 8.157894736842105e-06, + "loss": 0.1608, + "step": 4520, + "task_loss": 0.013635486364364624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.47015678882598877, + "epoch": 11.35, + "learning_rate": 7.86549707602339e-06, + "loss": 0.1832, + "step": 4530, + "task_loss": 0.17718610167503357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.36746612191200256, + "epoch": 11.38, + "learning_rate": 7.5730994152046775e-06, + "loss": 0.1559, + "step": 4540, + "task_loss": 0.26919397711753845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.3066258430480957, + "epoch": 11.4, + "learning_rate": 7.280701754385964e-06, + "loss": 0.1626, + "step": 4550, + "task_loss": 0.08805280923843384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.21340705454349518, + "epoch": 11.43, + "learning_rate": 6.98830409356725e-06, + "loss": 0.1768, + "step": 4560, + "task_loss": 0.16585972905158997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.5809781551361084, + "epoch": 11.45, + "learning_rate": 6.695906432748537e-06, + "loss": 0.1898, + "step": 4570, + "task_loss": 0.31455641984939575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.31768062710762024, + "epoch": 11.48, + "learning_rate": 6.403508771929824e-06, + "loss": 0.1649, + "step": 4580, + "task_loss": 0.2114783525466919 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.23975571990013123, + "epoch": 11.5, + "learning_rate": 6.11111111111111e-06, + "loss": 0.1643, + "step": 4590, + "task_loss": 0.034987449645996094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.40438854694366455, + "epoch": 11.53, + "learning_rate": 5.818713450292397e-06, + "loss": 0.1531, + "step": 4600, + "task_loss": 0.2620016932487488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.11603149771690369, + "epoch": 11.55, + "learning_rate": 5.526315789473683e-06, + "loss": 0.1272, + "step": 4610, + "task_loss": 0.04970061779022217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.04775264859199524, + "epoch": 11.58, + "learning_rate": 5.23391812865497e-06, + "loss": 0.1616, + "step": 4620, + "task_loss": 0.02132624387741089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.11175966262817383, + "epoch": 11.6, + "learning_rate": 4.941520467836257e-06, + "loss": 0.1758, + "step": 4630, + "task_loss": 0.09407076239585876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.03183834254741669, + "epoch": 11.63, + "learning_rate": 4.649122807017543e-06, + "loss": 0.1642, + "step": 4640, + "task_loss": 0.011670589447021484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.03778667002916336, + "epoch": 11.65, + "learning_rate": 4.35672514619883e-06, + "loss": 0.1842, + "step": 4650, + "task_loss": 0.11324834823608398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.06027041748166084, + "epoch": 11.68, + "learning_rate": 4.064327485380116e-06, + "loss": 0.1553, + "step": 4660, + "task_loss": 0.03356653451919556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.10275101661682129, + "epoch": 11.7, + "learning_rate": 3.771929824561403e-06, + "loss": 0.169, + "step": 4670, + "task_loss": 0.04669731855392456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.10769091546535492, + "epoch": 11.73, + "learning_rate": 3.4795321637426897e-06, + "loss": 0.1337, + "step": 4680, + "task_loss": 0.041588425636291504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.18004803359508514, + "epoch": 11.75, + "learning_rate": 3.187134502923976e-06, + "loss": 0.1598, + "step": 4690, + "task_loss": 0.06004643440246582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.06837992370128632, + "epoch": 11.78, + "learning_rate": 2.894736842105263e-06, + "loss": 0.1706, + "step": 4700, + "task_loss": 0.10142296552658081 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.3725603222846985, + "epoch": 11.8, + "learning_rate": 2.6023391812865493e-06, + "loss": 0.1747, + "step": 4710, + "task_loss": 0.1912306547164917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.24147792160511017, + "epoch": 11.83, + "learning_rate": 2.3099415204678357e-06, + "loss": 0.1578, + "step": 4720, + "task_loss": 0.1674221158027649 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.44135892391204834, + "epoch": 11.85, + "learning_rate": 2.0175438596491226e-06, + "loss": 0.1313, + "step": 4730, + "task_loss": 0.21962109208106995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.2678522765636444, + "epoch": 11.88, + "learning_rate": 1.7251461988304092e-06, + "loss": 0.1521, + "step": 4740, + "task_loss": 0.0885564386844635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.45461341738700867, + "epoch": 11.9, + "learning_rate": 1.4327485380116958e-06, + "loss": 0.1516, + "step": 4750, + "task_loss": 0.17013055086135864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.05051502212882042, + "epoch": 11.93, + "learning_rate": 1.1403508771929824e-06, + "loss": 0.1645, + "step": 4760, + "task_loss": 0.129602313041687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.2683265805244446, + "epoch": 11.95, + "learning_rate": 8.479532163742689e-07, + "loss": 0.1474, + "step": 4770, + "task_loss": 0.14865410327911377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.63749089600271, + "compression/movement_sparsity/model_sparsity": 0.5730886610217056, + "compression_loss": 0.0, + "distillation_loss": 0.37407612800598145, + "epoch": 11.98, + "learning_rate": 5.555555555555555e-07, + "loss": 0.1836, + "step": 4780, + "task_loss": 0.14674681425094604 + }, + { + "epoch": 12.0, + "eval_accuracy": 0.9794057075610474, + "eval_loss": 0.08688130974769592, + "eval_runtime": 89.0371, + "eval_samples_per_second": 76.35, + "eval_steps_per_second": 2.392, + "step": 4788 + }, + { + "epoch": 12.0, + "step": 4788, + "total_flos": 5.579752612756608e+18, + "train_loss": 3.67681538981503, + "train_runtime": 15601.2245, + "train_samples_per_second": 39.3, + "train_steps_per_second": 0.307 + } + ], + "max_steps": 4788, + "num_train_epochs": 12, + "total_flos": 5.579752612756608e+18, + "trial_name": null, + "trial_params": null +}