diff --git "a/checkpoint-1167/trainer_state.json" "b/checkpoint-1167/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1167/trainer_state.json" @@ -0,0 +1,9476 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9770408163265305, + "eval_steps": 59, + "global_step": 1167, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002551020408163265, + "grad_norm": 4.640754222869873, + "learning_rate": 0.0, + "loss": 0.7912, + "step": 1 + }, + { + "epoch": 0.00510204081632653, + "grad_norm": 19.163949966430664, + "learning_rate": 5.649717514124295e-07, + "loss": 3.5781, + "step": 2 + }, + { + "epoch": 0.007653061224489796, + "grad_norm": 4.321211814880371, + "learning_rate": 1.129943502824859e-06, + "loss": 0.8711, + "step": 3 + }, + { + "epoch": 0.01020408163265306, + "grad_norm": 5.854691505432129, + "learning_rate": 1.6949152542372882e-06, + "loss": 0.9923, + "step": 4 + }, + { + "epoch": 0.012755102040816327, + "grad_norm": 2.789024591445923, + "learning_rate": 2.259887005649718e-06, + "loss": 0.6723, + "step": 5 + }, + { + "epoch": 0.015306122448979591, + "grad_norm": 6.3219895362854, + "learning_rate": 2.824858757062147e-06, + "loss": 1.0542, + "step": 6 + }, + { + "epoch": 0.017857142857142856, + "grad_norm": 5.24042272567749, + "learning_rate": 3.3898305084745763e-06, + "loss": 0.8721, + "step": 7 + }, + { + "epoch": 0.02040816326530612, + "grad_norm": 4.587536334991455, + "learning_rate": 3.954802259887006e-06, + "loss": 0.8121, + "step": 8 + }, + { + "epoch": 0.02295918367346939, + "grad_norm": 5.2536725997924805, + "learning_rate": 4.519774011299436e-06, + "loss": 0.9226, + "step": 9 + }, + { + "epoch": 0.025510204081632654, + "grad_norm": 4.832824230194092, + "learning_rate": 5.084745762711865e-06, + "loss": 0.7534, + "step": 10 + }, + { + "epoch": 0.02806122448979592, + "grad_norm": 5.17836856842041, + "learning_rate": 5.649717514124294e-06, + "loss": 0.9769, + "step": 11 + }, + { + "epoch": 0.030612244897959183, + "grad_norm": 6.042638301849365, + "learning_rate": 6.214689265536724e-06, + "loss": 1.1295, + "step": 12 + }, + { + "epoch": 0.03316326530612245, + "grad_norm": 5.177023410797119, + "learning_rate": 6.779661016949153e-06, + "loss": 0.9773, + "step": 13 + }, + { + "epoch": 0.03571428571428571, + "grad_norm": 4.675361156463623, + "learning_rate": 7.3446327683615825e-06, + "loss": 0.7239, + "step": 14 + }, + { + "epoch": 0.03826530612244898, + "grad_norm": 4.548404216766357, + "learning_rate": 7.909604519774012e-06, + "loss": 0.6364, + "step": 15 + }, + { + "epoch": 0.04081632653061224, + "grad_norm": 4.334882736206055, + "learning_rate": 8.47457627118644e-06, + "loss": 0.7573, + "step": 16 + }, + { + "epoch": 0.04336734693877551, + "grad_norm": 4.564873695373535, + "learning_rate": 9.039548022598871e-06, + "loss": 0.7629, + "step": 17 + }, + { + "epoch": 0.04591836734693878, + "grad_norm": 4.975100040435791, + "learning_rate": 9.6045197740113e-06, + "loss": 0.8665, + "step": 18 + }, + { + "epoch": 0.04846938775510204, + "grad_norm": 4.1612019538879395, + "learning_rate": 1.016949152542373e-05, + "loss": 0.6049, + "step": 19 + }, + { + "epoch": 0.05102040816326531, + "grad_norm": 4.456337928771973, + "learning_rate": 1.0734463276836158e-05, + "loss": 0.6587, + "step": 20 + }, + { + "epoch": 0.05357142857142857, + "grad_norm": 3.83097505569458, + "learning_rate": 1.1299435028248587e-05, + "loss": 0.5717, + "step": 21 + }, + 
{ + "epoch": 0.05612244897959184, + "grad_norm": 3.482477903366089, + "learning_rate": 1.1864406779661018e-05, + "loss": 0.4781, + "step": 22 + }, + { + "epoch": 0.058673469387755105, + "grad_norm": 3.8852853775024414, + "learning_rate": 1.2429378531073447e-05, + "loss": 0.4699, + "step": 23 + }, + { + "epoch": 0.061224489795918366, + "grad_norm": 12.518363952636719, + "learning_rate": 1.2994350282485876e-05, + "loss": 1.7145, + "step": 24 + }, + { + "epoch": 0.06377551020408163, + "grad_norm": 3.4650845527648926, + "learning_rate": 1.3559322033898305e-05, + "loss": 0.531, + "step": 25 + }, + { + "epoch": 0.0663265306122449, + "grad_norm": 3.0887956619262695, + "learning_rate": 1.4124293785310736e-05, + "loss": 0.5584, + "step": 26 + }, + { + "epoch": 0.06887755102040816, + "grad_norm": 2.7673590183258057, + "learning_rate": 1.4689265536723165e-05, + "loss": 0.398, + "step": 27 + }, + { + "epoch": 0.07142857142857142, + "grad_norm": 3.6365857124328613, + "learning_rate": 1.5254237288135596e-05, + "loss": 0.5015, + "step": 28 + }, + { + "epoch": 0.07397959183673469, + "grad_norm": 2.849165916442871, + "learning_rate": 1.5819209039548023e-05, + "loss": 0.4741, + "step": 29 + }, + { + "epoch": 0.07653061224489796, + "grad_norm": 2.6374197006225586, + "learning_rate": 1.638418079096045e-05, + "loss": 0.3762, + "step": 30 + }, + { + "epoch": 0.07908163265306123, + "grad_norm": 5.663973331451416, + "learning_rate": 1.694915254237288e-05, + "loss": 0.6952, + "step": 31 + }, + { + "epoch": 0.08163265306122448, + "grad_norm": 2.380173921585083, + "learning_rate": 1.7514124293785312e-05, + "loss": 0.2723, + "step": 32 + }, + { + "epoch": 0.08418367346938775, + "grad_norm": 3.0484137535095215, + "learning_rate": 1.8079096045197743e-05, + "loss": 0.4301, + "step": 33 + }, + { + "epoch": 0.08673469387755102, + "grad_norm": 2.5542681217193604, + "learning_rate": 1.864406779661017e-05, + "loss": 0.3839, + "step": 34 + }, + { + "epoch": 0.08928571428571429, + "grad_norm": 2.6905667781829834, + "learning_rate": 1.92090395480226e-05, + "loss": 0.3154, + "step": 35 + }, + { + "epoch": 0.09183673469387756, + "grad_norm": 2.3147170543670654, + "learning_rate": 1.977401129943503e-05, + "loss": 0.2796, + "step": 36 + }, + { + "epoch": 0.09438775510204081, + "grad_norm": 2.257939338684082, + "learning_rate": 2.033898305084746e-05, + "loss": 0.2964, + "step": 37 + }, + { + "epoch": 0.09693877551020408, + "grad_norm": 1.8513349294662476, + "learning_rate": 2.0903954802259886e-05, + "loss": 0.2232, + "step": 38 + }, + { + "epoch": 0.09948979591836735, + "grad_norm": 2.0890722274780273, + "learning_rate": 2.1468926553672317e-05, + "loss": 0.2661, + "step": 39 + }, + { + "epoch": 0.10204081632653061, + "grad_norm": 2.6349799633026123, + "learning_rate": 2.2033898305084748e-05, + "loss": 0.3133, + "step": 40 + }, + { + "epoch": 0.10459183673469388, + "grad_norm": 1.9986095428466797, + "learning_rate": 2.2598870056497175e-05, + "loss": 0.2047, + "step": 41 + }, + { + "epoch": 0.10714285714285714, + "grad_norm": 1.957235336303711, + "learning_rate": 2.3163841807909606e-05, + "loss": 0.2206, + "step": 42 + }, + { + "epoch": 0.1096938775510204, + "grad_norm": 1.913266897201538, + "learning_rate": 2.3728813559322036e-05, + "loss": 0.1694, + "step": 43 + }, + { + "epoch": 0.11224489795918367, + "grad_norm": 1.6499242782592773, + "learning_rate": 2.4293785310734467e-05, + "loss": 0.1864, + "step": 44 + }, + { + "epoch": 0.11479591836734694, + "grad_norm": 2.0955774784088135, + "learning_rate": 2.4858757062146894e-05, + 
"loss": 0.2126, + "step": 45 + }, + { + "epoch": 0.11734693877551021, + "grad_norm": 1.676824688911438, + "learning_rate": 2.5423728813559322e-05, + "loss": 0.1589, + "step": 46 + }, + { + "epoch": 0.11989795918367346, + "grad_norm": 2.3270509243011475, + "learning_rate": 2.5988700564971752e-05, + "loss": 0.2539, + "step": 47 + }, + { + "epoch": 0.12244897959183673, + "grad_norm": 2.5719568729400635, + "learning_rate": 2.6553672316384183e-05, + "loss": 0.2403, + "step": 48 + }, + { + "epoch": 0.125, + "grad_norm": 1.9103913307189941, + "learning_rate": 2.711864406779661e-05, + "loss": 0.1666, + "step": 49 + }, + { + "epoch": 0.12755102040816327, + "grad_norm": 1.6335673332214355, + "learning_rate": 2.768361581920904e-05, + "loss": 0.1633, + "step": 50 + }, + { + "epoch": 0.13010204081632654, + "grad_norm": 2.256082057952881, + "learning_rate": 2.8248587570621472e-05, + "loss": 0.2204, + "step": 51 + }, + { + "epoch": 0.1326530612244898, + "grad_norm": 1.4262380599975586, + "learning_rate": 2.88135593220339e-05, + "loss": 0.0716, + "step": 52 + }, + { + "epoch": 0.13520408163265307, + "grad_norm": 1.700405478477478, + "learning_rate": 2.937853107344633e-05, + "loss": 0.1254, + "step": 53 + }, + { + "epoch": 0.1377551020408163, + "grad_norm": 2.738555669784546, + "learning_rate": 2.994350282485876e-05, + "loss": 0.3478, + "step": 54 + }, + { + "epoch": 0.14030612244897958, + "grad_norm": 2.329765558242798, + "learning_rate": 3.050847457627119e-05, + "loss": 0.2607, + "step": 55 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 2.302366256713867, + "learning_rate": 3.107344632768362e-05, + "loss": 0.2158, + "step": 56 + }, + { + "epoch": 0.14540816326530612, + "grad_norm": 1.8881169557571411, + "learning_rate": 3.1638418079096046e-05, + "loss": 0.2082, + "step": 57 + }, + { + "epoch": 0.14795918367346939, + "grad_norm": 2.457533359527588, + "learning_rate": 3.2203389830508473e-05, + "loss": 0.2334, + "step": 58 + }, + { + "epoch": 0.15051020408163265, + "grad_norm": 2.476409435272217, + "learning_rate": 3.27683615819209e-05, + "loss": 0.2203, + "step": 59 + }, + { + "epoch": 0.15051020408163265, + "eval_NLI_loss": 0.9446932673454285, + "eval_NLI_runtime": 7.1853, + "eval_NLI_samples_per_second": 11.83, + "eval_NLI_steps_per_second": 0.139, + "eval_Qnli-dev_cosine_accuracy": 0.7265625, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.6843721270561218, + "eval_Qnli-dev_cosine_ap": 0.7343480130942729, + "eval_Qnli-dev_cosine_f1": 0.6956521739130435, + "eval_Qnli-dev_cosine_f1_threshold": 0.6574362516403198, + "eval_Qnli-dev_cosine_mcc": 0.44823424273450235, + "eval_Qnli-dev_cosine_precision": 0.7142857142857143, + "eval_Qnli-dev_cosine_recall": 0.6779661016949152, + "eval_allNLI-dev_cosine_accuracy": 0.765625, + "eval_allNLI-dev_cosine_accuracy_threshold": 0.7472490072250366, + "eval_allNLI-dev_cosine_ap": 0.6640883745760782, + "eval_allNLI-dev_cosine_f1": 0.7058823529411764, + "eval_allNLI-dev_cosine_f1_threshold": 0.6876716017723083, + "eval_allNLI-dev_cosine_mcc": 0.5368906701581015, + "eval_allNLI-dev_cosine_precision": 0.6101694915254238, + "eval_allNLI-dev_cosine_recall": 0.8372093023255814, + "eval_sequential_score": 0.7343480130942729, + "eval_sts-test_pearson_cosine": 0.9002300823689351, + "eval_sts-test_spearman_cosine": 0.9128881860762427, + "step": 59 + }, + { + "epoch": 0.15051020408163265, + "eval_natural-questions_loss": 0.21671055257320404, + "eval_natural-questions_runtime": 42.1549, + "eval_natural-questions_samples_per_second": 2.681, + 
"eval_natural-questions_steps_per_second": 0.024, + "step": 59 + }, + { + "epoch": 0.15051020408163265, + "eval_vitaminc_loss": 2.4174671173095703, + "eval_vitaminc_runtime": 1.5099, + "eval_vitaminc_samples_per_second": 74.84, + "eval_vitaminc_steps_per_second": 0.662, + "step": 59 + }, + { + "epoch": 0.15051020408163265, + "eval_xsum_loss": 0.17104597389698029, + "eval_xsum_runtime": 7.5958, + "eval_xsum_samples_per_second": 14.877, + "eval_xsum_steps_per_second": 0.132, + "step": 59 + }, + { + "epoch": 0.15051020408163265, + "eval_paws_loss": 0.020362937822937965, + "eval_paws_runtime": 1.3746, + "eval_paws_samples_per_second": 82.206, + "eval_paws_steps_per_second": 0.727, + "step": 59 + }, + { + "epoch": 0.15051020408163265, + "eval_global_dataset_loss": 0.2823931872844696, + "eval_global_dataset_runtime": 15.8638, + "eval_global_dataset_samples_per_second": 16.137, + "eval_global_dataset_steps_per_second": 0.063, + "step": 59 + }, + { + "epoch": 0.15306122448979592, + "grad_norm": 1.9292579889297485, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.1368, + "step": 60 + }, + { + "epoch": 0.1556122448979592, + "grad_norm": 2.376699447631836, + "learning_rate": 3.389830508474576e-05, + "loss": 0.2153, + "step": 61 + }, + { + "epoch": 0.15816326530612246, + "grad_norm": 1.6167395114898682, + "learning_rate": 3.446327683615819e-05, + "loss": 0.0711, + "step": 62 + }, + { + "epoch": 0.16071428571428573, + "grad_norm": 2.245579242706299, + "learning_rate": 3.5028248587570624e-05, + "loss": 0.2255, + "step": 63 + }, + { + "epoch": 0.16326530612244897, + "grad_norm": 1.7884480953216553, + "learning_rate": 3.559322033898305e-05, + "loss": 0.0982, + "step": 64 + }, + { + "epoch": 0.16581632653061223, + "grad_norm": 2.0018584728240967, + "learning_rate": 3.6158192090395485e-05, + "loss": 0.1388, + "step": 65 + }, + { + "epoch": 0.1683673469387755, + "grad_norm": 2.278801918029785, + "learning_rate": 3.672316384180791e-05, + "loss": 0.1797, + "step": 66 + }, + { + "epoch": 0.17091836734693877, + "grad_norm": 4.3720927238464355, + "learning_rate": 3.728813559322034e-05, + "loss": 0.4173, + "step": 67 + }, + { + "epoch": 0.17346938775510204, + "grad_norm": 0.10999222844839096, + "learning_rate": 3.7853107344632774e-05, + "loss": 0.0102, + "step": 68 + }, + { + "epoch": 0.1760204081632653, + "grad_norm": 0.9621866345405579, + "learning_rate": 3.84180790960452e-05, + "loss": 0.0634, + "step": 69 + }, + { + "epoch": 0.17857142857142858, + "grad_norm": 2.172738552093506, + "learning_rate": 3.898305084745763e-05, + "loss": 0.1956, + "step": 70 + }, + { + "epoch": 0.18112244897959184, + "grad_norm": 2.5349419116973877, + "learning_rate": 3.954802259887006e-05, + "loss": 0.2188, + "step": 71 + }, + { + "epoch": 0.1836734693877551, + "grad_norm": 1.8658069372177124, + "learning_rate": 4.011299435028249e-05, + "loss": 0.1399, + "step": 72 + }, + { + "epoch": 0.18622448979591838, + "grad_norm": 1.95505690574646, + "learning_rate": 4.067796610169492e-05, + "loss": 0.1489, + "step": 73 + }, + { + "epoch": 0.18877551020408162, + "grad_norm": 2.0226597785949707, + "learning_rate": 4.1242937853107345e-05, + "loss": 0.1567, + "step": 74 + }, + { + "epoch": 0.1913265306122449, + "grad_norm": 2.3184401988983154, + "learning_rate": 4.180790960451977e-05, + "loss": 0.2404, + "step": 75 + }, + { + "epoch": 0.19387755102040816, + "grad_norm": 1.8563607931137085, + "learning_rate": 4.2372881355932206e-05, + "loss": 0.1295, + "step": 76 + }, + { + "epoch": 0.19642857142857142, + "grad_norm": 3.6983888149261475, + 
"learning_rate": 4.2937853107344634e-05, + "loss": 0.4541, + "step": 77 + }, + { + "epoch": 0.1989795918367347, + "grad_norm": 2.6192500591278076, + "learning_rate": 4.350282485875706e-05, + "loss": 0.2364, + "step": 78 + }, + { + "epoch": 0.20153061224489796, + "grad_norm": 1.3945753574371338, + "learning_rate": 4.4067796610169495e-05, + "loss": 0.0929, + "step": 79 + }, + { + "epoch": 0.20408163265306123, + "grad_norm": 2.1527037620544434, + "learning_rate": 4.463276836158192e-05, + "loss": 0.1699, + "step": 80 + }, + { + "epoch": 0.2066326530612245, + "grad_norm": 2.0259571075439453, + "learning_rate": 4.519774011299435e-05, + "loss": 0.1846, + "step": 81 + }, + { + "epoch": 0.20918367346938777, + "grad_norm": 1.680916428565979, + "learning_rate": 4.5762711864406784e-05, + "loss": 0.1126, + "step": 82 + }, + { + "epoch": 0.21173469387755103, + "grad_norm": 1.6830638647079468, + "learning_rate": 4.632768361581921e-05, + "loss": 0.1151, + "step": 83 + }, + { + "epoch": 0.21428571428571427, + "grad_norm": 2.2057628631591797, + "learning_rate": 4.689265536723164e-05, + "loss": 0.2015, + "step": 84 + }, + { + "epoch": 0.21683673469387754, + "grad_norm": 1.639238953590393, + "learning_rate": 4.745762711864407e-05, + "loss": 0.1028, + "step": 85 + }, + { + "epoch": 0.2193877551020408, + "grad_norm": 2.395477771759033, + "learning_rate": 4.80225988700565e-05, + "loss": 0.2284, + "step": 86 + }, + { + "epoch": 0.22193877551020408, + "grad_norm": 1.587292194366455, + "learning_rate": 4.8587570621468934e-05, + "loss": 0.1368, + "step": 87 + }, + { + "epoch": 0.22448979591836735, + "grad_norm": 1.606228232383728, + "learning_rate": 4.915254237288136e-05, + "loss": 0.0836, + "step": 88 + }, + { + "epoch": 0.22704081632653061, + "grad_norm": 1.4456911087036133, + "learning_rate": 4.971751412429379e-05, + "loss": 0.1276, + "step": 89 + }, + { + "epoch": 0.22959183673469388, + "grad_norm": 1.9870944023132324, + "learning_rate": 5.028248587570622e-05, + "loss": 0.181, + "step": 90 + }, + { + "epoch": 0.23214285714285715, + "grad_norm": 1.7185001373291016, + "learning_rate": 5.0847457627118643e-05, + "loss": 0.1516, + "step": 91 + }, + { + "epoch": 0.23469387755102042, + "grad_norm": 2.459712266921997, + "learning_rate": 5.141242937853108e-05, + "loss": 0.1769, + "step": 92 + }, + { + "epoch": 0.2372448979591837, + "grad_norm": 2.0864956378936768, + "learning_rate": 5.1977401129943505e-05, + "loss": 0.1261, + "step": 93 + }, + { + "epoch": 0.23979591836734693, + "grad_norm": 2.35571551322937, + "learning_rate": 5.254237288135594e-05, + "loss": 0.2324, + "step": 94 + }, + { + "epoch": 0.2423469387755102, + "grad_norm": 1.7409874200820923, + "learning_rate": 5.3107344632768366e-05, + "loss": 0.1046, + "step": 95 + }, + { + "epoch": 0.24489795918367346, + "grad_norm": 1.9755069017410278, + "learning_rate": 5.36723163841808e-05, + "loss": 0.1372, + "step": 96 + }, + { + "epoch": 0.24744897959183673, + "grad_norm": 1.3746156692504883, + "learning_rate": 5.423728813559322e-05, + "loss": 0.0654, + "step": 97 + }, + { + "epoch": 0.25, + "grad_norm": 2.2815542221069336, + "learning_rate": 5.480225988700565e-05, + "loss": 0.2279, + "step": 98 + }, + { + "epoch": 0.25255102040816324, + "grad_norm": 1.522020697593689, + "learning_rate": 5.536723163841808e-05, + "loss": 0.0807, + "step": 99 + }, + { + "epoch": 0.25510204081632654, + "grad_norm": 1.7788827419281006, + "learning_rate": 5.593220338983051e-05, + "loss": 0.123, + "step": 100 + }, + { + "epoch": 0.2576530612244898, + "grad_norm": 1.8609931468963623, + 
"learning_rate": 5.6497175141242944e-05, + "loss": 0.1464, + "step": 101 + }, + { + "epoch": 0.2602040816326531, + "grad_norm": 1.3817753791809082, + "learning_rate": 5.7062146892655364e-05, + "loss": 0.0897, + "step": 102 + }, + { + "epoch": 0.2627551020408163, + "grad_norm": 1.866331696510315, + "learning_rate": 5.76271186440678e-05, + "loss": 0.1612, + "step": 103 + }, + { + "epoch": 0.2653061224489796, + "grad_norm": 1.8496090173721313, + "learning_rate": 5.8192090395480226e-05, + "loss": 0.1289, + "step": 104 + }, + { + "epoch": 0.26785714285714285, + "grad_norm": 3.13761043548584, + "learning_rate": 5.875706214689266e-05, + "loss": 0.7234, + "step": 105 + }, + { + "epoch": 0.27040816326530615, + "grad_norm": 1.9749125242233276, + "learning_rate": 5.932203389830509e-05, + "loss": 0.1004, + "step": 106 + }, + { + "epoch": 0.2729591836734694, + "grad_norm": 1.7557865381240845, + "learning_rate": 5.988700564971752e-05, + "loss": 0.1227, + "step": 107 + }, + { + "epoch": 0.2755102040816326, + "grad_norm": 2.5279057025909424, + "learning_rate": 6.045197740112994e-05, + "loss": 0.2446, + "step": 108 + }, + { + "epoch": 0.2780612244897959, + "grad_norm": 2.323599100112915, + "learning_rate": 6.101694915254238e-05, + "loss": 0.1338, + "step": 109 + }, + { + "epoch": 0.28061224489795916, + "grad_norm": 1.2157937288284302, + "learning_rate": 6.158192090395481e-05, + "loss": 0.0427, + "step": 110 + }, + { + "epoch": 0.28316326530612246, + "grad_norm": 2.0103859901428223, + "learning_rate": 6.214689265536724e-05, + "loss": 0.1149, + "step": 111 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 1.8882136344909668, + "learning_rate": 6.271186440677966e-05, + "loss": 0.1524, + "step": 112 + }, + { + "epoch": 0.288265306122449, + "grad_norm": 1.8960480690002441, + "learning_rate": 6.327683615819209e-05, + "loss": 0.1308, + "step": 113 + }, + { + "epoch": 0.29081632653061223, + "grad_norm": 2.4462227821350098, + "learning_rate": 6.384180790960452e-05, + "loss": 0.192, + "step": 114 + }, + { + "epoch": 0.29336734693877553, + "grad_norm": 2.042628288269043, + "learning_rate": 6.440677966101695e-05, + "loss": 0.141, + "step": 115 + }, + { + "epoch": 0.29591836734693877, + "grad_norm": 2.021028757095337, + "learning_rate": 6.497175141242939e-05, + "loss": 0.1539, + "step": 116 + }, + { + "epoch": 0.29846938775510207, + "grad_norm": 1.8326799869537354, + "learning_rate": 6.55367231638418e-05, + "loss": 0.1548, + "step": 117 + }, + { + "epoch": 0.3010204081632653, + "grad_norm": 2.1327924728393555, + "learning_rate": 6.610169491525424e-05, + "loss": 0.1284, + "step": 118 + }, + { + "epoch": 0.3010204081632653, + "eval_NLI_loss": 0.8681649565696716, + "eval_NLI_runtime": 7.1447, + "eval_NLI_samples_per_second": 11.897, + "eval_NLI_steps_per_second": 0.14, + "eval_Qnli-dev_cosine_accuracy": 0.7421875, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.6371709108352661, + "eval_Qnli-dev_cosine_ap": 0.7364484045530271, + "eval_Qnli-dev_cosine_f1": 0.7226890756302521, + "eval_Qnli-dev_cosine_f1_threshold": 0.6371709108352661, + "eval_Qnli-dev_cosine_mcc": 0.4819043546124848, + "eval_Qnli-dev_cosine_precision": 0.7166666666666667, + "eval_Qnli-dev_cosine_recall": 0.7288135593220338, + "eval_allNLI-dev_cosine_accuracy": 0.7578125, + "eval_allNLI-dev_cosine_accuracy_threshold": 0.6858762502670288, + "eval_allNLI-dev_cosine_ap": 0.6650692878242668, + "eval_allNLI-dev_cosine_f1": 0.7037037037037038, + "eval_allNLI-dev_cosine_f1_threshold": 0.6323522329330444, + "eval_allNLI-dev_cosine_mcc": 0.5347980893043971, + 
"eval_allNLI-dev_cosine_precision": 0.5846153846153846, + "eval_allNLI-dev_cosine_recall": 0.8837209302325582, + "eval_sequential_score": 0.7364484045530271, + "eval_sts-test_pearson_cosine": 0.9044351509469432, + "eval_sts-test_spearman_cosine": 0.9150873525635388, + "step": 118 + }, + { + "epoch": 0.3010204081632653, + "eval_natural-questions_loss": 0.13882923126220703, + "eval_natural-questions_runtime": 42.1317, + "eval_natural-questions_samples_per_second": 2.682, + "eval_natural-questions_steps_per_second": 0.024, + "step": 118 + }, + { + "epoch": 0.3010204081632653, + "eval_vitaminc_loss": 2.3304193019866943, + "eval_vitaminc_runtime": 1.5158, + "eval_vitaminc_samples_per_second": 74.546, + "eval_vitaminc_steps_per_second": 0.66, + "step": 118 + }, + { + "epoch": 0.3010204081632653, + "eval_xsum_loss": 0.10622090846300125, + "eval_xsum_runtime": 7.5931, + "eval_xsum_samples_per_second": 14.882, + "eval_xsum_steps_per_second": 0.132, + "step": 118 + }, + { + "epoch": 0.3010204081632653, + "eval_paws_loss": 0.019996974617242813, + "eval_paws_runtime": 1.3717, + "eval_paws_samples_per_second": 82.378, + "eval_paws_steps_per_second": 0.729, + "step": 118 + }, + { + "epoch": 0.3010204081632653, + "eval_global_dataset_loss": 0.26942023634910583, + "eval_global_dataset_runtime": 15.836, + "eval_global_dataset_samples_per_second": 16.166, + "eval_global_dataset_steps_per_second": 0.063, + "step": 118 + }, + { + "epoch": 0.30357142857142855, + "grad_norm": 1.6810752153396606, + "learning_rate": 6.666666666666667e-05, + "loss": 0.0939, + "step": 119 + }, + { + "epoch": 0.30612244897959184, + "grad_norm": 2.3066258430480957, + "learning_rate": 6.72316384180791e-05, + "loss": 0.2675, + "step": 120 + }, + { + "epoch": 0.3086734693877551, + "grad_norm": 1.9492994546890259, + "learning_rate": 6.779661016949152e-05, + "loss": 0.1542, + "step": 121 + }, + { + "epoch": 0.3112244897959184, + "grad_norm": 1.8484933376312256, + "learning_rate": 6.836158192090397e-05, + "loss": 0.1347, + "step": 122 + }, + { + "epoch": 0.3137755102040816, + "grad_norm": 1.802600622177124, + "learning_rate": 6.892655367231638e-05, + "loss": 0.1285, + "step": 123 + }, + { + "epoch": 0.3163265306122449, + "grad_norm": 1.8063158988952637, + "learning_rate": 6.949152542372882e-05, + "loss": 0.1025, + "step": 124 + }, + { + "epoch": 0.31887755102040816, + "grad_norm": 1.6940953731536865, + "learning_rate": 7.005649717514125e-05, + "loss": 0.0879, + "step": 125 + }, + { + "epoch": 0.32142857142857145, + "grad_norm": 1.059670329093933, + "learning_rate": 7.062146892655367e-05, + "loss": 0.0446, + "step": 126 + }, + { + "epoch": 0.3239795918367347, + "grad_norm": 2.1733789443969727, + "learning_rate": 7.11864406779661e-05, + "loss": 0.1739, + "step": 127 + }, + { + "epoch": 0.32653061224489793, + "grad_norm": 1.7153602838516235, + "learning_rate": 7.175141242937854e-05, + "loss": 0.1309, + "step": 128 + }, + { + "epoch": 0.32908163265306123, + "grad_norm": 2.2577075958251953, + "learning_rate": 7.231638418079097e-05, + "loss": 0.1737, + "step": 129 + }, + { + "epoch": 0.33163265306122447, + "grad_norm": 1.3602570295333862, + "learning_rate": 7.288135593220338e-05, + "loss": 0.1063, + "step": 130 + }, + { + "epoch": 0.33418367346938777, + "grad_norm": 0.31389689445495605, + "learning_rate": 7.344632768361583e-05, + "loss": 0.0568, + "step": 131 + }, + { + "epoch": 0.336734693877551, + "grad_norm": 2.102017402648926, + "learning_rate": 7.401129943502825e-05, + "loss": 0.1966, + "step": 132 + }, + { + "epoch": 0.3392857142857143, + 
"grad_norm": 2.4718096256256104, + "learning_rate": 7.457627118644068e-05, + "loss": 0.2336, + "step": 133 + }, + { + "epoch": 0.34183673469387754, + "grad_norm": 2.1702322959899902, + "learning_rate": 7.514124293785311e-05, + "loss": 0.1716, + "step": 134 + }, + { + "epoch": 0.34438775510204084, + "grad_norm": 1.5582847595214844, + "learning_rate": 7.570621468926555e-05, + "loss": 0.0979, + "step": 135 + }, + { + "epoch": 0.3469387755102041, + "grad_norm": 1.811691164970398, + "learning_rate": 7.627118644067796e-05, + "loss": 0.1319, + "step": 136 + }, + { + "epoch": 0.3494897959183674, + "grad_norm": 1.4586181640625, + "learning_rate": 7.68361581920904e-05, + "loss": 0.1058, + "step": 137 + }, + { + "epoch": 0.3520408163265306, + "grad_norm": 2.367692470550537, + "learning_rate": 7.740112994350283e-05, + "loss": 0.225, + "step": 138 + }, + { + "epoch": 0.35459183673469385, + "grad_norm": 1.6998730897903442, + "learning_rate": 7.796610169491526e-05, + "loss": 0.1045, + "step": 139 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 1.6210442781448364, + "learning_rate": 7.853107344632768e-05, + "loss": 0.1066, + "step": 140 + }, + { + "epoch": 0.3596938775510204, + "grad_norm": 1.8243966102600098, + "learning_rate": 7.909604519774013e-05, + "loss": 0.1234, + "step": 141 + }, + { + "epoch": 0.3622448979591837, + "grad_norm": 2.078484058380127, + "learning_rate": 7.966101694915254e-05, + "loss": 0.1707, + "step": 142 + }, + { + "epoch": 0.3647959183673469, + "grad_norm": 1.5675996541976929, + "learning_rate": 8.022598870056498e-05, + "loss": 0.1204, + "step": 143 + }, + { + "epoch": 0.3673469387755102, + "grad_norm": 2.6848459243774414, + "learning_rate": 8.079096045197741e-05, + "loss": 0.2086, + "step": 144 + }, + { + "epoch": 0.36989795918367346, + "grad_norm": 1.7380269765853882, + "learning_rate": 8.135593220338983e-05, + "loss": 0.0982, + "step": 145 + }, + { + "epoch": 0.37244897959183676, + "grad_norm": 1.4549691677093506, + "learning_rate": 8.192090395480226e-05, + "loss": 0.0937, + "step": 146 + }, + { + "epoch": 0.375, + "grad_norm": 2.3912200927734375, + "learning_rate": 8.248587570621469e-05, + "loss": 0.1763, + "step": 147 + }, + { + "epoch": 0.37755102040816324, + "grad_norm": 1.4187450408935547, + "learning_rate": 8.305084745762712e-05, + "loss": 0.0601, + "step": 148 + }, + { + "epoch": 0.38010204081632654, + "grad_norm": 1.9366487264633179, + "learning_rate": 8.361581920903954e-05, + "loss": 0.1354, + "step": 149 + }, + { + "epoch": 0.3826530612244898, + "grad_norm": 1.6401911973953247, + "learning_rate": 8.418079096045199e-05, + "loss": 0.1135, + "step": 150 + }, + { + "epoch": 0.3852040816326531, + "grad_norm": 2.1863253116607666, + "learning_rate": 8.474576271186441e-05, + "loss": 0.2146, + "step": 151 + }, + { + "epoch": 0.3877551020408163, + "grad_norm": 1.718955159187317, + "learning_rate": 8.531073446327684e-05, + "loss": 0.0868, + "step": 152 + }, + { + "epoch": 0.3903061224489796, + "grad_norm": 2.850691795349121, + "learning_rate": 8.587570621468927e-05, + "loss": 0.2428, + "step": 153 + }, + { + "epoch": 0.39285714285714285, + "grad_norm": 1.264520287513733, + "learning_rate": 8.644067796610171e-05, + "loss": 0.0582, + "step": 154 + }, + { + "epoch": 0.39540816326530615, + "grad_norm": 1.64748215675354, + "learning_rate": 8.700564971751412e-05, + "loss": 0.1299, + "step": 155 + }, + { + "epoch": 0.3979591836734694, + "grad_norm": 1.7413716316223145, + "learning_rate": 8.757062146892656e-05, + "loss": 0.0911, + "step": 156 + }, + { + "epoch": 
0.4005102040816326, + "grad_norm": 1.8686145544052124, + "learning_rate": 8.813559322033899e-05, + "loss": 0.1184, + "step": 157 + }, + { + "epoch": 0.4030612244897959, + "grad_norm": 1.3806580305099487, + "learning_rate": 8.870056497175142e-05, + "loss": 0.0692, + "step": 158 + }, + { + "epoch": 0.40561224489795916, + "grad_norm": 1.8071966171264648, + "learning_rate": 8.926553672316384e-05, + "loss": 0.1228, + "step": 159 + }, + { + "epoch": 0.40816326530612246, + "grad_norm": 1.3450262546539307, + "learning_rate": 8.983050847457629e-05, + "loss": 0.0574, + "step": 160 + }, + { + "epoch": 0.4107142857142857, + "grad_norm": 1.3434524536132812, + "learning_rate": 9.03954802259887e-05, + "loss": 0.0822, + "step": 161 + }, + { + "epoch": 0.413265306122449, + "grad_norm": 1.8820223808288574, + "learning_rate": 9.096045197740113e-05, + "loss": 0.1071, + "step": 162 + }, + { + "epoch": 0.41581632653061223, + "grad_norm": 1.0908993482589722, + "learning_rate": 9.152542372881357e-05, + "loss": 0.0544, + "step": 163 + }, + { + "epoch": 0.41836734693877553, + "grad_norm": 1.8816108703613281, + "learning_rate": 9.2090395480226e-05, + "loss": 0.1261, + "step": 164 + }, + { + "epoch": 0.42091836734693877, + "grad_norm": 1.7035259008407593, + "learning_rate": 9.265536723163842e-05, + "loss": 0.094, + "step": 165 + }, + { + "epoch": 0.42346938775510207, + "grad_norm": 2.307813882827759, + "learning_rate": 9.322033898305085e-05, + "loss": 0.1539, + "step": 166 + }, + { + "epoch": 0.4260204081632653, + "grad_norm": 1.13148832321167, + "learning_rate": 9.378531073446328e-05, + "loss": 0.045, + "step": 167 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 2.0030624866485596, + "learning_rate": 9.43502824858757e-05, + "loss": 0.1074, + "step": 168 + }, + { + "epoch": 0.43112244897959184, + "grad_norm": 2.3511245250701904, + "learning_rate": 9.491525423728815e-05, + "loss": 0.1626, + "step": 169 + }, + { + "epoch": 0.4336734693877551, + "grad_norm": 1.8808695077896118, + "learning_rate": 9.548022598870057e-05, + "loss": 0.1337, + "step": 170 + }, + { + "epoch": 0.4362244897959184, + "grad_norm": 2.318882703781128, + "learning_rate": 9.6045197740113e-05, + "loss": 0.1737, + "step": 171 + }, + { + "epoch": 0.4387755102040816, + "grad_norm": 1.7159000635147095, + "learning_rate": 9.661016949152543e-05, + "loss": 0.104, + "step": 172 + }, + { + "epoch": 0.4413265306122449, + "grad_norm": 1.802137017250061, + "learning_rate": 9.717514124293787e-05, + "loss": 0.0989, + "step": 173 + }, + { + "epoch": 0.44387755102040816, + "grad_norm": 2.6452414989471436, + "learning_rate": 9.774011299435028e-05, + "loss": 0.2015, + "step": 174 + }, + { + "epoch": 0.44642857142857145, + "grad_norm": 1.916659951210022, + "learning_rate": 9.830508474576272e-05, + "loss": 0.1364, + "step": 175 + }, + { + "epoch": 0.4489795918367347, + "grad_norm": 1.8515814542770386, + "learning_rate": 9.887005649717515e-05, + "loss": 0.0968, + "step": 176 + }, + { + "epoch": 0.45153061224489793, + "grad_norm": 1.9390264749526978, + "learning_rate": 9.943502824858758e-05, + "loss": 0.0868, + "step": 177 + }, + { + "epoch": 0.45153061224489793, + "eval_NLI_loss": 0.8198293447494507, + "eval_NLI_runtime": 7.1757, + "eval_NLI_samples_per_second": 11.845, + "eval_NLI_steps_per_second": 0.139, + "eval_Qnli-dev_cosine_accuracy": 0.7265625, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.6064693927764893, + "eval_Qnli-dev_cosine_ap": 0.7384081520364875, + "eval_Qnli-dev_cosine_f1": 0.7154471544715446, + "eval_Qnli-dev_cosine_f1_threshold": 
0.6064693927764893, + "eval_Qnli-dev_cosine_mcc": 0.45451419021127865, + "eval_Qnli-dev_cosine_precision": 0.6875, + "eval_Qnli-dev_cosine_recall": 0.7457627118644068, + "eval_allNLI-dev_cosine_accuracy": 0.765625, + "eval_allNLI-dev_cosine_accuracy_threshold": 0.677872896194458, + "eval_allNLI-dev_cosine_ap": 0.6675617855438021, + "eval_allNLI-dev_cosine_f1": 0.6972477064220183, + "eval_allNLI-dev_cosine_f1_threshold": 0.6217950582504272, + "eval_allNLI-dev_cosine_mcc": 0.5238753184730511, + "eval_allNLI-dev_cosine_precision": 0.5757575757575758, + "eval_allNLI-dev_cosine_recall": 0.8837209302325582, + "eval_sequential_score": 0.7384081520364875, + "eval_sts-test_pearson_cosine": 0.9053834511159466, + "eval_sts-test_spearman_cosine": 0.9166033910456952, + "step": 177 + }, + { + "epoch": 0.45153061224489793, + "eval_natural-questions_loss": 0.09840350598096848, + "eval_natural-questions_runtime": 42.1503, + "eval_natural-questions_samples_per_second": 2.681, + "eval_natural-questions_steps_per_second": 0.024, + "step": 177 + }, + { + "epoch": 0.45153061224489793, + "eval_vitaminc_loss": 2.3936150074005127, + "eval_vitaminc_runtime": 1.5107, + "eval_vitaminc_samples_per_second": 74.798, + "eval_vitaminc_steps_per_second": 0.662, + "step": 177 + }, + { + "epoch": 0.45153061224489793, + "eval_xsum_loss": 0.08041348308324814, + "eval_xsum_runtime": 7.5979, + "eval_xsum_samples_per_second": 14.872, + "eval_xsum_steps_per_second": 0.132, + "step": 177 + }, + { + "epoch": 0.45153061224489793, + "eval_paws_loss": 0.02037755399942398, + "eval_paws_runtime": 1.3737, + "eval_paws_samples_per_second": 82.259, + "eval_paws_steps_per_second": 0.728, + "step": 177 + }, + { + "epoch": 0.45153061224489793, + "eval_global_dataset_loss": 0.27296456694602966, + "eval_global_dataset_runtime": 15.8335, + "eval_global_dataset_samples_per_second": 16.168, + "eval_global_dataset_steps_per_second": 0.063, + "step": 177 + }, + { + "epoch": 0.45408163265306123, + "grad_norm": 1.5879788398742676, + "learning_rate": 0.0001, + "loss": 0.0538, + "step": 178 + }, + { + "epoch": 0.45663265306122447, + "grad_norm": 1.7767413854599, + "learning_rate": 9.99998351772482e-05, + "loss": 0.0855, + "step": 179 + }, + { + "epoch": 0.45918367346938777, + "grad_norm": 1.7397089004516602, + "learning_rate": 9.999934071062278e-05, + "loss": 0.1492, + "step": 180 + }, + { + "epoch": 0.461734693877551, + "grad_norm": 1.6828938722610474, + "learning_rate": 9.999851660501372e-05, + "loss": 0.0799, + "step": 181 + }, + { + "epoch": 0.4642857142857143, + "grad_norm": 1.880875825881958, + "learning_rate": 9.999736286857087e-05, + "loss": 0.0979, + "step": 182 + }, + { + "epoch": 0.46683673469387754, + "grad_norm": 1.719166874885559, + "learning_rate": 9.999587951270395e-05, + "loss": 0.087, + "step": 183 + }, + { + "epoch": 0.46938775510204084, + "grad_norm": 2.0441653728485107, + "learning_rate": 9.999406655208245e-05, + "loss": 0.1763, + "step": 184 + }, + { + "epoch": 0.4719387755102041, + "grad_norm": 2.1032490730285645, + "learning_rate": 9.999192400463538e-05, + "loss": 0.1646, + "step": 185 + }, + { + "epoch": 0.4744897959183674, + "grad_norm": 1.6751331090927124, + "learning_rate": 9.998945189155114e-05, + "loss": 0.1483, + "step": 186 + }, + { + "epoch": 0.4770408163265306, + "grad_norm": 2.058666944503784, + "learning_rate": 9.998665023727741e-05, + "loss": 0.1098, + "step": 187 + }, + { + "epoch": 0.47959183673469385, + "grad_norm": 3.390571117401123, + "learning_rate": 9.998351906952073e-05, + "loss": 0.6778, + "step": 188 + }, + 
{ + "epoch": 0.48214285714285715, + "grad_norm": 1.7647160291671753, + "learning_rate": 9.998005841924638e-05, + "loss": 0.116, + "step": 189 + }, + { + "epoch": 0.4846938775510204, + "grad_norm": 2.1248953342437744, + "learning_rate": 9.9976268320678e-05, + "loss": 0.1465, + "step": 190 + }, + { + "epoch": 0.4872448979591837, + "grad_norm": 1.8044167757034302, + "learning_rate": 9.997214881129728e-05, + "loss": 0.1113, + "step": 191 + }, + { + "epoch": 0.4897959183673469, + "grad_norm": 1.9576975107192993, + "learning_rate": 9.99676999318435e-05, + "loss": 0.1467, + "step": 192 + }, + { + "epoch": 0.4923469387755102, + "grad_norm": 1.0994269847869873, + "learning_rate": 9.996292172631328e-05, + "loss": 0.0744, + "step": 193 + }, + { + "epoch": 0.49489795918367346, + "grad_norm": 1.6973727941513062, + "learning_rate": 9.995781424196002e-05, + "loss": 0.1342, + "step": 194 + }, + { + "epoch": 0.49744897959183676, + "grad_norm": 1.836393117904663, + "learning_rate": 9.995237752929353e-05, + "loss": 0.0979, + "step": 195 + }, + { + "epoch": 0.5, + "grad_norm": 2.1066081523895264, + "learning_rate": 9.994661164207945e-05, + "loss": 0.1969, + "step": 196 + }, + { + "epoch": 0.5025510204081632, + "grad_norm": 2.3128280639648438, + "learning_rate": 9.99405166373387e-05, + "loss": 0.1349, + "step": 197 + }, + { + "epoch": 0.5051020408163265, + "grad_norm": 1.9174576997756958, + "learning_rate": 9.993409257534706e-05, + "loss": 0.1122, + "step": 198 + }, + { + "epoch": 0.5076530612244898, + "grad_norm": 1.647660493850708, + "learning_rate": 9.992733951963438e-05, + "loss": 0.1032, + "step": 199 + }, + { + "epoch": 0.5102040816326531, + "grad_norm": 1.7161622047424316, + "learning_rate": 9.992025753698411e-05, + "loss": 0.0757, + "step": 200 + }, + { + "epoch": 0.5127551020408163, + "grad_norm": 2.367258071899414, + "learning_rate": 9.991284669743255e-05, + "loss": 0.5715, + "step": 201 + }, + { + "epoch": 0.5153061224489796, + "grad_norm": 0.9766954779624939, + "learning_rate": 9.990510707426823e-05, + "loss": 0.0359, + "step": 202 + }, + { + "epoch": 0.5178571428571429, + "grad_norm": 1.4801597595214844, + "learning_rate": 9.989703874403109e-05, + "loss": 0.0845, + "step": 203 + }, + { + "epoch": 0.5204081632653061, + "grad_norm": 1.728097677230835, + "learning_rate": 9.988864178651178e-05, + "loss": 0.0776, + "step": 204 + }, + { + "epoch": 0.5229591836734694, + "grad_norm": 2.287801504135132, + "learning_rate": 9.987991628475088e-05, + "loss": 0.154, + "step": 205 + }, + { + "epoch": 0.5255102040816326, + "grad_norm": 1.1805185079574585, + "learning_rate": 9.987086232503811e-05, + "loss": 0.0553, + "step": 206 + }, + { + "epoch": 0.5280612244897959, + "grad_norm": 1.843317985534668, + "learning_rate": 9.986147999691133e-05, + "loss": 0.0871, + "step": 207 + }, + { + "epoch": 0.5306122448979592, + "grad_norm": 1.7194178104400635, + "learning_rate": 9.985176939315584e-05, + "loss": 0.1214, + "step": 208 + }, + { + "epoch": 0.5331632653061225, + "grad_norm": 2.5024309158325195, + "learning_rate": 9.984173060980331e-05, + "loss": 0.1983, + "step": 209 + }, + { + "epoch": 0.5357142857142857, + "grad_norm": 1.6570473909378052, + "learning_rate": 9.983136374613098e-05, + "loss": 0.1244, + "step": 210 + }, + { + "epoch": 0.5382653061224489, + "grad_norm": 1.2387135028839111, + "learning_rate": 9.98206689046605e-05, + "loss": 0.0517, + "step": 211 + }, + { + "epoch": 0.5408163265306123, + "grad_norm": 2.1405575275421143, + "learning_rate": 9.980964619115711e-05, + "loss": 0.1522, + "step": 212 + }, + { + 
"epoch": 0.5433673469387755, + "grad_norm": 1.5599679946899414, + "learning_rate": 9.979829571462843e-05, + "loss": 0.0749, + "step": 213 + }, + { + "epoch": 0.5459183673469388, + "grad_norm": 1.7910748720169067, + "learning_rate": 9.978661758732344e-05, + "loss": 0.0966, + "step": 214 + }, + { + "epoch": 0.548469387755102, + "grad_norm": 2.0318689346313477, + "learning_rate": 9.977461192473145e-05, + "loss": 0.1224, + "step": 215 + }, + { + "epoch": 0.5510204081632653, + "grad_norm": 2.262890100479126, + "learning_rate": 9.97622788455808e-05, + "loss": 0.2397, + "step": 216 + }, + { + "epoch": 0.5535714285714286, + "grad_norm": 1.402499794960022, + "learning_rate": 9.974961847183784e-05, + "loss": 0.0847, + "step": 217 + }, + { + "epoch": 0.5561224489795918, + "grad_norm": 0.6643227338790894, + "learning_rate": 9.973663092870562e-05, + "loss": 0.0252, + "step": 218 + }, + { + "epoch": 0.5586734693877551, + "grad_norm": 1.9640554189682007, + "learning_rate": 9.97233163446227e-05, + "loss": 0.1269, + "step": 219 + }, + { + "epoch": 0.5612244897959183, + "grad_norm": 1.900482416152954, + "learning_rate": 9.970967485126186e-05, + "loss": 0.1205, + "step": 220 + }, + { + "epoch": 0.5637755102040817, + "grad_norm": 1.2652589082717896, + "learning_rate": 9.96957065835288e-05, + "loss": 0.046, + "step": 221 + }, + { + "epoch": 0.5663265306122449, + "grad_norm": 1.6186816692352295, + "learning_rate": 9.968141167956084e-05, + "loss": 0.0701, + "step": 222 + }, + { + "epoch": 0.5688775510204082, + "grad_norm": 1.8102420568466187, + "learning_rate": 9.966679028072548e-05, + "loss": 0.1206, + "step": 223 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 1.285929799079895, + "learning_rate": 9.965184253161908e-05, + "loss": 0.059, + "step": 224 + }, + { + "epoch": 0.5739795918367347, + "grad_norm": 2.2748634815216064, + "learning_rate": 9.96365685800654e-05, + "loss": 0.1602, + "step": 225 + }, + { + "epoch": 0.576530612244898, + "grad_norm": 1.5005991458892822, + "learning_rate": 9.962096857711409e-05, + "loss": 0.098, + "step": 226 + }, + { + "epoch": 0.5790816326530612, + "grad_norm": 1.4743398427963257, + "learning_rate": 9.960504267703934e-05, + "loss": 0.0658, + "step": 227 + }, + { + "epoch": 0.5816326530612245, + "grad_norm": 1.6295772790908813, + "learning_rate": 9.958879103733811e-05, + "loss": 0.0755, + "step": 228 + }, + { + "epoch": 0.5841836734693877, + "grad_norm": 1.5652583837509155, + "learning_rate": 9.957221381872888e-05, + "loss": 0.1011, + "step": 229 + }, + { + "epoch": 0.5867346938775511, + "grad_norm": 2.0053470134735107, + "learning_rate": 9.955531118514976e-05, + "loss": 0.1612, + "step": 230 + }, + { + "epoch": 0.5892857142857143, + "grad_norm": 0.6709102988243103, + "learning_rate": 9.953808330375706e-05, + "loss": 0.0268, + "step": 231 + }, + { + "epoch": 0.5918367346938775, + "grad_norm": 1.4081071615219116, + "learning_rate": 9.952053034492364e-05, + "loss": 0.0478, + "step": 232 + }, + { + "epoch": 0.5943877551020408, + "grad_norm": 1.6391704082489014, + "learning_rate": 9.950265248223707e-05, + "loss": 0.0741, + "step": 233 + }, + { + "epoch": 0.5969387755102041, + "grad_norm": 1.5504906177520752, + "learning_rate": 9.948444989249808e-05, + "loss": 0.0985, + "step": 234 + }, + { + "epoch": 0.5994897959183674, + "grad_norm": 1.8062090873718262, + "learning_rate": 9.946592275571873e-05, + "loss": 0.0736, + "step": 235 + }, + { + "epoch": 0.6020408163265306, + "grad_norm": 1.7001596689224243, + "learning_rate": 9.944707125512063e-05, + "loss": 0.1142, + "step": 236 + 
}, + { + "epoch": 0.6020408163265306, + "eval_NLI_loss": 0.7994421720504761, + "eval_NLI_runtime": 7.1591, + "eval_NLI_samples_per_second": 11.873, + "eval_NLI_steps_per_second": 0.14, + "eval_Qnli-dev_cosine_accuracy": 0.7265625, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.6457405090332031, + "eval_Qnli-dev_cosine_ap": 0.7331921735040308, + "eval_Qnli-dev_cosine_f1": 0.6956521739130435, + "eval_Qnli-dev_cosine_f1_threshold": 0.5757047533988953, + "eval_Qnli-dev_cosine_mcc": 0.37357604001642775, + "eval_Qnli-dev_cosine_precision": 0.6075949367088608, + "eval_Qnli-dev_cosine_recall": 0.8135593220338984, + "eval_allNLI-dev_cosine_accuracy": 0.78125, + "eval_allNLI-dev_cosine_accuracy_threshold": 0.7120361328125, + "eval_allNLI-dev_cosine_ap": 0.676708312028844, + "eval_allNLI-dev_cosine_f1": 0.6990291262135923, + "eval_allNLI-dev_cosine_f1_threshold": 0.660472571849823, + "eval_allNLI-dev_cosine_mcc": 0.5251630700083426, + "eval_allNLI-dev_cosine_precision": 0.6, + "eval_allNLI-dev_cosine_recall": 0.8372093023255814, + "eval_sequential_score": 0.7331921735040308, + "eval_sts-test_pearson_cosine": 0.9087666657124194, + "eval_sts-test_spearman_cosine": 0.9172235092747959, + "step": 236 + }, + { + "epoch": 0.6020408163265306, + "eval_natural-questions_loss": 0.1305474191904068, + "eval_natural-questions_runtime": 42.154, + "eval_natural-questions_samples_per_second": 2.681, + "eval_natural-questions_steps_per_second": 0.024, + "step": 236 + }, + { + "epoch": 0.6020408163265306, + "eval_vitaminc_loss": 2.355496644973755, + "eval_vitaminc_runtime": 1.5088, + "eval_vitaminc_samples_per_second": 74.894, + "eval_vitaminc_steps_per_second": 0.663, + "step": 236 + }, + { + "epoch": 0.6020408163265306, + "eval_xsum_loss": 0.061459288001060486, + "eval_xsum_runtime": 7.5992, + "eval_xsum_samples_per_second": 14.87, + "eval_xsum_steps_per_second": 0.132, + "step": 236 + }, + { + "epoch": 0.6020408163265306, + "eval_paws_loss": 0.02127019129693508, + "eval_paws_runtime": 1.3725, + "eval_paws_samples_per_second": 82.332, + "eval_paws_steps_per_second": 0.729, + "step": 236 + }, + { + "epoch": 0.6020408163265306, + "eval_global_dataset_loss": 0.2741450369358063, + "eval_global_dataset_runtime": 15.8562, + "eval_global_dataset_samples_per_second": 16.145, + "eval_global_dataset_steps_per_second": 0.063, + "step": 236 + }, + { + "epoch": 0.6045918367346939, + "grad_norm": 1.6400598287582397, + "learning_rate": 9.942789557713317e-05, + "loss": 0.1271, + "step": 237 + }, + { + "epoch": 0.6071428571428571, + "grad_norm": 1.3375244140625, + "learning_rate": 9.940839591139161e-05, + "loss": 0.061, + "step": 238 + }, + { + "epoch": 0.6096938775510204, + "grad_norm": 1.3930450677871704, + "learning_rate": 9.93885724507353e-05, + "loss": 0.0756, + "step": 239 + }, + { + "epoch": 0.6122448979591837, + "grad_norm": 1.9619495868682861, + "learning_rate": 9.93684253912056e-05, + "loss": 0.0948, + "step": 240 + }, + { + "epoch": 0.6147959183673469, + "grad_norm": 1.8124289512634277, + "learning_rate": 9.934795493204423e-05, + "loss": 0.1604, + "step": 241 + }, + { + "epoch": 0.6173469387755102, + "grad_norm": 0.2577800452709198, + "learning_rate": 9.9327161275691e-05, + "loss": 0.0668, + "step": 242 + }, + { + "epoch": 0.6198979591836735, + "grad_norm": 0.7307854294776917, + "learning_rate": 9.930604462778195e-05, + "loss": 0.0386, + "step": 243 + }, + { + "epoch": 0.6224489795918368, + "grad_norm": 1.9956462383270264, + "learning_rate": 9.928460519714733e-05, + "loss": 0.1708, + "step": 244 + }, + { + "epoch": 0.625, 
+ "grad_norm": 1.7439672946929932, + "learning_rate": 9.926284319580952e-05, + "loss": 0.0829, + "step": 245 + }, + { + "epoch": 0.6275510204081632, + "grad_norm": 2.449082136154175, + "learning_rate": 9.924075883898087e-05, + "loss": 0.1878, + "step": 246 + }, + { + "epoch": 0.6301020408163265, + "grad_norm": 1.7873239517211914, + "learning_rate": 9.921835234506165e-05, + "loss": 0.1039, + "step": 247 + }, + { + "epoch": 0.6326530612244898, + "grad_norm": 1.0824297666549683, + "learning_rate": 9.919562393563788e-05, + "loss": 0.064, + "step": 248 + }, + { + "epoch": 0.6352040816326531, + "grad_norm": 1.9150983095169067, + "learning_rate": 9.917257383547909e-05, + "loss": 0.106, + "step": 249 + }, + { + "epoch": 0.6377551020408163, + "grad_norm": 2.432682991027832, + "learning_rate": 9.914920227253612e-05, + "loss": 0.1597, + "step": 250 + }, + { + "epoch": 0.6403061224489796, + "grad_norm": 3.333723306655884, + "learning_rate": 9.91255094779389e-05, + "loss": 0.4868, + "step": 251 + }, + { + "epoch": 0.6428571428571429, + "grad_norm": 2.1557610034942627, + "learning_rate": 9.910149568599413e-05, + "loss": 0.1583, + "step": 252 + }, + { + "epoch": 0.6454081632653061, + "grad_norm": 1.764701247215271, + "learning_rate": 9.907716113418297e-05, + "loss": 0.0839, + "step": 253 + }, + { + "epoch": 0.6479591836734694, + "grad_norm": 1.5699714422225952, + "learning_rate": 9.905250606315868e-05, + "loss": 0.071, + "step": 254 + }, + { + "epoch": 0.6505102040816326, + "grad_norm": 2.7636210918426514, + "learning_rate": 9.902753071674426e-05, + "loss": 0.1673, + "step": 255 + }, + { + "epoch": 0.6530612244897959, + "grad_norm": 2.9466187953948975, + "learning_rate": 9.900223534193002e-05, + "loss": 0.5533, + "step": 256 + }, + { + "epoch": 0.6556122448979592, + "grad_norm": 1.628265142440796, + "learning_rate": 9.897662018887119e-05, + "loss": 0.1301, + "step": 257 + }, + { + "epoch": 0.6581632653061225, + "grad_norm": 1.7887464761734009, + "learning_rate": 9.895068551088533e-05, + "loss": 0.085, + "step": 258 + }, + { + "epoch": 0.6607142857142857, + "grad_norm": 1.1941776275634766, + "learning_rate": 9.892443156444997e-05, + "loss": 0.0545, + "step": 259 + }, + { + "epoch": 0.6632653061224489, + "grad_norm": 0.9058825373649597, + "learning_rate": 9.889785860919996e-05, + "loss": 0.0408, + "step": 260 + }, + { + "epoch": 0.6658163265306123, + "grad_norm": 3.7213516235351562, + "learning_rate": 9.887096690792495e-05, + "loss": 0.6112, + "step": 261 + }, + { + "epoch": 0.6683673469387755, + "grad_norm": 2.214836359024048, + "learning_rate": 9.884375672656679e-05, + "loss": 0.1493, + "step": 262 + }, + { + "epoch": 0.6709183673469388, + "grad_norm": 2.141777992248535, + "learning_rate": 9.881622833421691e-05, + "loss": 0.1581, + "step": 263 + }, + { + "epoch": 0.673469387755102, + "grad_norm": 2.498159408569336, + "learning_rate": 9.878838200311365e-05, + "loss": 0.2356, + "step": 264 + }, + { + "epoch": 0.6760204081632653, + "grad_norm": 2.3548645973205566, + "learning_rate": 9.876021800863948e-05, + "loss": 0.1972, + "step": 265 + }, + { + "epoch": 0.6785714285714286, + "grad_norm": 1.3429771661758423, + "learning_rate": 9.873173662931851e-05, + "loss": 0.0527, + "step": 266 + }, + { + "epoch": 0.6811224489795918, + "grad_norm": 1.6313376426696777, + "learning_rate": 9.870293814681344e-05, + "loss": 0.1335, + "step": 267 + }, + { + "epoch": 0.6836734693877551, + "grad_norm": 1.5429177284240723, + "learning_rate": 9.867382284592299e-05, + "loss": 0.0674, + "step": 268 + }, + { + "epoch": 
0.6862244897959183, + "grad_norm": 1.6669093370437622, + "learning_rate": 9.864439101457903e-05, + "loss": 0.0656, + "step": 269 + }, + { + "epoch": 0.6887755102040817, + "grad_norm": 1.487358570098877, + "learning_rate": 9.861464294384363e-05, + "loss": 0.0622, + "step": 270 + }, + { + "epoch": 0.6913265306122449, + "grad_norm": 2.2861199378967285, + "learning_rate": 9.858457892790638e-05, + "loss": 0.2093, + "step": 271 + }, + { + "epoch": 0.6938775510204082, + "grad_norm": 1.0644478797912598, + "learning_rate": 9.855419926408127e-05, + "loss": 0.0605, + "step": 272 + }, + { + "epoch": 0.6964285714285714, + "grad_norm": 1.9579869508743286, + "learning_rate": 9.852350425280392e-05, + "loss": 0.117, + "step": 273 + }, + { + "epoch": 0.6989795918367347, + "grad_norm": 1.9787768125534058, + "learning_rate": 9.849249419762848e-05, + "loss": 0.0991, + "step": 274 + }, + { + "epoch": 0.701530612244898, + "grad_norm": 1.7187714576721191, + "learning_rate": 9.846116940522469e-05, + "loss": 0.1294, + "step": 275 + }, + { + "epoch": 0.7040816326530612, + "grad_norm": 1.254471778869629, + "learning_rate": 9.842953018537491e-05, + "loss": 0.0482, + "step": 276 + }, + { + "epoch": 0.7066326530612245, + "grad_norm": 1.1846364736557007, + "learning_rate": 9.839757685097088e-05, + "loss": 0.062, + "step": 277 + }, + { + "epoch": 0.7091836734693877, + "grad_norm": 2.0722057819366455, + "learning_rate": 9.836530971801085e-05, + "loss": 0.1289, + "step": 278 + }, + { + "epoch": 0.7117346938775511, + "grad_norm": 1.923488974571228, + "learning_rate": 9.833272910559626e-05, + "loss": 0.103, + "step": 279 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 2.131899356842041, + "learning_rate": 9.829983533592863e-05, + "loss": 0.1764, + "step": 280 + }, + { + "epoch": 0.7168367346938775, + "grad_norm": 1.982722520828247, + "learning_rate": 9.826662873430652e-05, + "loss": 0.1517, + "step": 281 + }, + { + "epoch": 0.7193877551020408, + "grad_norm": 1.7337759733200073, + "learning_rate": 9.823310962912211e-05, + "loss": 0.128, + "step": 282 + }, + { + "epoch": 0.7219387755102041, + "grad_norm": 2.05423903465271, + "learning_rate": 9.81992783518581e-05, + "loss": 0.1119, + "step": 283 + }, + { + "epoch": 0.7244897959183674, + "grad_norm": 1.3350353240966797, + "learning_rate": 9.816513523708429e-05, + "loss": 0.0813, + "step": 284 + }, + { + "epoch": 0.7270408163265306, + "grad_norm": 1.2914992570877075, + "learning_rate": 9.813068062245446e-05, + "loss": 0.0525, + "step": 285 + }, + { + "epoch": 0.7295918367346939, + "grad_norm": 2.0979316234588623, + "learning_rate": 9.809591484870282e-05, + "loss": 0.1221, + "step": 286 + }, + { + "epoch": 0.7321428571428571, + "grad_norm": 1.5068845748901367, + "learning_rate": 9.806083825964085e-05, + "loss": 0.0645, + "step": 287 + }, + { + "epoch": 0.7346938775510204, + "grad_norm": 1.7337675094604492, + "learning_rate": 9.802545120215372e-05, + "loss": 0.1155, + "step": 288 + }, + { + "epoch": 0.7372448979591837, + "grad_norm": 1.4691200256347656, + "learning_rate": 9.798975402619696e-05, + "loss": 0.0854, + "step": 289 + }, + { + "epoch": 0.7397959183673469, + "grad_norm": 1.3618502616882324, + "learning_rate": 9.795374708479298e-05, + "loss": 0.0759, + "step": 290 + }, + { + "epoch": 0.7423469387755102, + "grad_norm": 1.5375083684921265, + "learning_rate": 9.791743073402759e-05, + "loss": 0.0795, + "step": 291 + }, + { + "epoch": 0.7448979591836735, + "grad_norm": 1.5201373100280762, + "learning_rate": 9.788080533304642e-05, + "loss": 0.0842, + "step": 292 + }, + { + 
"epoch": 0.7474489795918368, + "grad_norm": 2.1918747425079346, + "learning_rate": 9.784387124405144e-05, + "loss": 0.1039, + "step": 293 + }, + { + "epoch": 0.75, + "grad_norm": 1.2931500673294067, + "learning_rate": 9.780662883229734e-05, + "loss": 0.0525, + "step": 294 + }, + { + "epoch": 0.7525510204081632, + "grad_norm": 1.53117036819458, + "learning_rate": 9.776907846608792e-05, + "loss": 0.0807, + "step": 295 + }, + { + "epoch": 0.7525510204081632, + "eval_NLI_loss": 0.812004566192627, + "eval_NLI_runtime": 7.1567, + "eval_NLI_samples_per_second": 11.877, + "eval_NLI_steps_per_second": 0.14, + "eval_Qnli-dev_cosine_accuracy": 0.71875, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.7058789730072021, + "eval_Qnli-dev_cosine_ap": 0.7280268374444242, + "eval_Qnli-dev_cosine_f1": 0.6887417218543046, + "eval_Qnli-dev_cosine_f1_threshold": 0.5248329639434814, + "eval_Qnli-dev_cosine_mcc": 0.3344280153577804, + "eval_Qnli-dev_cosine_precision": 0.5652173913043478, + "eval_Qnli-dev_cosine_recall": 0.8813559322033898, + "eval_allNLI-dev_cosine_accuracy": 0.7734375, + "eval_allNLI-dev_cosine_accuracy_threshold": 0.7263156175613403, + "eval_allNLI-dev_cosine_ap": 0.6781090048839976, + "eval_allNLI-dev_cosine_f1": 0.7155963302752293, + "eval_allNLI-dev_cosine_f1_threshold": 0.6493248343467712, + "eval_allNLI-dev_cosine_mcc": 0.5569730681100454, + "eval_allNLI-dev_cosine_precision": 0.5909090909090909, + "eval_allNLI-dev_cosine_recall": 0.9069767441860465, + "eval_sequential_score": 0.7280268374444242, + "eval_sts-test_pearson_cosine": 0.9044911125085072, + "eval_sts-test_spearman_cosine": 0.9157384386655222, + "step": 295 + }, + { + "epoch": 0.7525510204081632, + "eval_natural-questions_loss": 0.0891142338514328, + "eval_natural-questions_runtime": 42.1528, + "eval_natural-questions_samples_per_second": 2.681, + "eval_natural-questions_steps_per_second": 0.024, + "step": 295 + }, + { + "epoch": 0.7525510204081632, + "eval_vitaminc_loss": 2.49794340133667, + "eval_vitaminc_runtime": 1.5113, + "eval_vitaminc_samples_per_second": 74.768, + "eval_vitaminc_steps_per_second": 0.662, + "step": 295 + }, + { + "epoch": 0.7525510204081632, + "eval_xsum_loss": 0.07110549509525299, + "eval_xsum_runtime": 7.5876, + "eval_xsum_samples_per_second": 14.893, + "eval_xsum_steps_per_second": 0.132, + "step": 295 + }, + { + "epoch": 0.7525510204081632, + "eval_paws_loss": 0.020345555618405342, + "eval_paws_runtime": 1.3711, + "eval_paws_samples_per_second": 82.416, + "eval_paws_steps_per_second": 0.729, + "step": 295 + }, + { + "epoch": 0.7525510204081632, + "eval_global_dataset_loss": 0.2989472448825836, + "eval_global_dataset_runtime": 15.8529, + "eval_global_dataset_samples_per_second": 16.148, + "eval_global_dataset_steps_per_second": 0.063, + "step": 295 + }, + { + "epoch": 0.7551020408163265, + "grad_norm": 1.2729077339172363, + "learning_rate": 9.773122051677248e-05, + "loss": 0.059, + "step": 296 + }, + { + "epoch": 0.7576530612244898, + "grad_norm": 1.804408311843872, + "learning_rate": 9.769305535874207e-05, + "loss": 0.1255, + "step": 297 + }, + { + "epoch": 0.7602040816326531, + "grad_norm": 1.188318133354187, + "learning_rate": 9.765458336942592e-05, + "loss": 0.075, + "step": 298 + }, + { + "epoch": 0.7627551020408163, + "grad_norm": 1.393085241317749, + "learning_rate": 9.761580492928755e-05, + "loss": 0.0668, + "step": 299 + }, + { + "epoch": 0.7653061224489796, + "grad_norm": 1.7966115474700928, + "learning_rate": 9.757672042182109e-05, + "loss": 0.1193, + "step": 300 + }, + { + "epoch": 
0.7678571428571429, + "grad_norm": 1.2745482921600342, + "learning_rate": 9.753733023354754e-05, + "loss": 0.0539, + "step": 301 + }, + { + "epoch": 0.7704081632653061, + "grad_norm": 1.2256300449371338, + "learning_rate": 9.749763475401086e-05, + "loss": 0.0465, + "step": 302 + }, + { + "epoch": 0.7729591836734694, + "grad_norm": 0.20624244213104248, + "learning_rate": 9.745763437577409e-05, + "loss": 0.0324, + "step": 303 + }, + { + "epoch": 0.7755102040816326, + "grad_norm": 1.6620599031448364, + "learning_rate": 9.741732949441564e-05, + "loss": 0.0797, + "step": 304 + }, + { + "epoch": 0.7780612244897959, + "grad_norm": 1.9513704776763916, + "learning_rate": 9.737672050852515e-05, + "loss": 0.1331, + "step": 305 + }, + { + "epoch": 0.7806122448979592, + "grad_norm": 1.4471361637115479, + "learning_rate": 9.733580781969973e-05, + "loss": 0.0831, + "step": 306 + }, + { + "epoch": 0.7831632653061225, + "grad_norm": 2.0116097927093506, + "learning_rate": 9.729459183253988e-05, + "loss": 0.1194, + "step": 307 + }, + { + "epoch": 0.7857142857142857, + "grad_norm": 2.288022756576538, + "learning_rate": 9.725307295464557e-05, + "loss": 0.1545, + "step": 308 + }, + { + "epoch": 0.7882653061224489, + "grad_norm": 1.9034236669540405, + "learning_rate": 9.72112515966121e-05, + "loss": 0.1273, + "step": 309 + }, + { + "epoch": 0.7908163265306123, + "grad_norm": 1.7381569147109985, + "learning_rate": 9.716912817202622e-05, + "loss": 0.0925, + "step": 310 + }, + { + "epoch": 0.7933673469387755, + "grad_norm": 1.9344810247421265, + "learning_rate": 9.71267030974618e-05, + "loss": 0.1217, + "step": 311 + }, + { + "epoch": 0.7959183673469388, + "grad_norm": 1.4519567489624023, + "learning_rate": 9.708397679247588e-05, + "loss": 0.0549, + "step": 312 + }, + { + "epoch": 0.798469387755102, + "grad_norm": 1.8634049892425537, + "learning_rate": 9.704094967960453e-05, + "loss": 0.1441, + "step": 313 + }, + { + "epoch": 0.8010204081632653, + "grad_norm": 1.7073299884796143, + "learning_rate": 9.699762218435857e-05, + "loss": 0.1327, + "step": 314 + }, + { + "epoch": 0.8035714285714286, + "grad_norm": 1.3812566995620728, + "learning_rate": 9.695399473521943e-05, + "loss": 0.0495, + "step": 315 + }, + { + "epoch": 0.8061224489795918, + "grad_norm": 1.206460952758789, + "learning_rate": 9.691006776363482e-05, + "loss": 0.0473, + "step": 316 + }, + { + "epoch": 0.8086734693877551, + "grad_norm": 1.505863070487976, + "learning_rate": 9.686584170401468e-05, + "loss": 0.1109, + "step": 317 + }, + { + "epoch": 0.8112244897959183, + "grad_norm": 1.9382115602493286, + "learning_rate": 9.682131699372661e-05, + "loss": 0.1102, + "step": 318 + }, + { + "epoch": 0.8137755102040817, + "grad_norm": 1.3306173086166382, + "learning_rate": 9.677649407309175e-05, + "loss": 0.0674, + "step": 319 + }, + { + "epoch": 0.8163265306122449, + "grad_norm": 2.13614821434021, + "learning_rate": 9.673137338538035e-05, + "loss": 0.1076, + "step": 320 + }, + { + "epoch": 0.8188775510204082, + "grad_norm": 1.6416414976119995, + "learning_rate": 9.668595537680729e-05, + "loss": 0.076, + "step": 321 + }, + { + "epoch": 0.8214285714285714, + "grad_norm": 1.5224190950393677, + "learning_rate": 9.664024049652791e-05, + "loss": 0.0899, + "step": 322 + }, + { + "epoch": 0.8239795918367347, + "grad_norm": 1.4623075723648071, + "learning_rate": 9.659422919663333e-05, + "loss": 0.0539, + "step": 323 + }, + { + "epoch": 0.826530612244898, + "grad_norm": 1.504521131515503, + "learning_rate": 9.654792193214609e-05, + "loss": 0.0516, + "step": 324 + }, + { 
+ "epoch": 0.8290816326530612, + "grad_norm": 1.0403423309326172, + "learning_rate": 9.650131916101561e-05, + "loss": 0.0607, + "step": 325 + }, + { + "epoch": 0.8316326530612245, + "grad_norm": 2.122011661529541, + "learning_rate": 9.645442134411376e-05, + "loss": 0.1574, + "step": 326 + }, + { + "epoch": 0.8341836734693877, + "grad_norm": 1.7957897186279297, + "learning_rate": 9.640722894523014e-05, + "loss": 0.1324, + "step": 327 + }, + { + "epoch": 0.8367346938775511, + "grad_norm": 1.597949743270874, + "learning_rate": 9.635974243106762e-05, + "loss": 0.094, + "step": 328 + }, + { + "epoch": 0.8392857142857143, + "grad_norm": 1.577876091003418, + "learning_rate": 9.631196227123769e-05, + "loss": 0.0861, + "step": 329 + }, + { + "epoch": 0.8418367346938775, + "grad_norm": 2.038119316101074, + "learning_rate": 9.626388893825578e-05, + "loss": 0.0991, + "step": 330 + }, + { + "epoch": 0.8443877551020408, + "grad_norm": 0.8963376879692078, + "learning_rate": 9.621552290753663e-05, + "loss": 0.03, + "step": 331 + }, + { + "epoch": 0.8469387755102041, + "grad_norm": 1.5942103862762451, + "learning_rate": 9.616686465738959e-05, + "loss": 0.0785, + "step": 332 + }, + { + "epoch": 0.8494897959183674, + "grad_norm": 1.5355104207992554, + "learning_rate": 9.611791466901386e-05, + "loss": 0.1042, + "step": 333 + }, + { + "epoch": 0.8520408163265306, + "grad_norm": 0.6599818468093872, + "learning_rate": 9.606867342649373e-05, + "loss": 0.0184, + "step": 334 + }, + { + "epoch": 0.8545918367346939, + "grad_norm": 0.8163981437683105, + "learning_rate": 9.601914141679382e-05, + "loss": 0.033, + "step": 335 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 1.3256895542144775, + "learning_rate": 9.596931912975431e-05, + "loss": 0.0593, + "step": 336 + }, + { + "epoch": 0.8596938775510204, + "grad_norm": 1.4203654527664185, + "learning_rate": 9.591920705808593e-05, + "loss": 0.0798, + "step": 337 + }, + { + "epoch": 0.8622448979591837, + "grad_norm": 1.540278673171997, + "learning_rate": 9.58688056973653e-05, + "loss": 0.0746, + "step": 338 + }, + { + "epoch": 0.8647959183673469, + "grad_norm": 1.3792699575424194, + "learning_rate": 9.581811554602981e-05, + "loss": 0.0629, + "step": 339 + }, + { + "epoch": 0.8673469387755102, + "grad_norm": 1.1846177577972412, + "learning_rate": 9.576713710537295e-05, + "loss": 0.0401, + "step": 340 + }, + { + "epoch": 0.8698979591836735, + "grad_norm": 3.153242826461792, + "learning_rate": 9.571587087953911e-05, + "loss": 0.2941, + "step": 341 + }, + { + "epoch": 0.8724489795918368, + "grad_norm": 1.3724006414413452, + "learning_rate": 9.566431737551866e-05, + "loss": 0.0796, + "step": 342 + }, + { + "epoch": 0.875, + "grad_norm": 1.2515268325805664, + "learning_rate": 9.561247710314309e-05, + "loss": 0.0661, + "step": 343 + }, + { + "epoch": 0.8775510204081632, + "grad_norm": 1.4603686332702637, + "learning_rate": 9.556035057507978e-05, + "loss": 0.1058, + "step": 344 + }, + { + "epoch": 0.8801020408163265, + "grad_norm": 2.1749563217163086, + "learning_rate": 9.550793830682696e-05, + "loss": 0.1774, + "step": 345 + }, + { + "epoch": 0.8826530612244898, + "grad_norm": 1.797114372253418, + "learning_rate": 9.54552408167087e-05, + "loss": 0.1408, + "step": 346 + }, + { + "epoch": 0.8852040816326531, + "grad_norm": 1.0089930295944214, + "learning_rate": 9.540225862586974e-05, + "loss": 0.0373, + "step": 347 + }, + { + "epoch": 0.8877551020408163, + "grad_norm": 1.7380998134613037, + "learning_rate": 9.534899225827027e-05, + "loss": 0.0758, + "step": 348 + }, + { + 
"epoch": 0.8903061224489796, + "grad_norm": 1.680227518081665, + "learning_rate": 9.529544224068087e-05, + "loss": 0.0997, + "step": 349 + }, + { + "epoch": 0.8928571428571429, + "grad_norm": 0.7344462871551514, + "learning_rate": 9.524160910267724e-05, + "loss": 0.045, + "step": 350 + }, + { + "epoch": 0.8954081632653061, + "grad_norm": 0.7496012449264526, + "learning_rate": 9.518749337663491e-05, + "loss": 0.0246, + "step": 351 + }, + { + "epoch": 0.8979591836734694, + "grad_norm": 1.4923900365829468, + "learning_rate": 9.513309559772403e-05, + "loss": 0.0645, + "step": 352 + }, + { + "epoch": 0.9005102040816326, + "grad_norm": 1.4803483486175537, + "learning_rate": 9.507841630390415e-05, + "loss": 0.1046, + "step": 353 + }, + { + "epoch": 0.9030612244897959, + "grad_norm": 1.7457276582717896, + "learning_rate": 9.502345603591873e-05, + "loss": 0.0857, + "step": 354 + }, + { + "epoch": 0.9030612244897959, + "eval_NLI_loss": 0.8229475617408752, + "eval_NLI_runtime": 7.1574, + "eval_NLI_samples_per_second": 11.876, + "eval_NLI_steps_per_second": 0.14, + "eval_Qnli-dev_cosine_accuracy": 0.7109375, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.7116010189056396, + "eval_Qnli-dev_cosine_ap": 0.7346738886839252, + "eval_Qnli-dev_cosine_f1": 0.7092198581560283, + "eval_Qnli-dev_cosine_f1_threshold": 0.5781652927398682, + "eval_Qnli-dev_cosine_mcc": 0.3986067385792586, + "eval_Qnli-dev_cosine_precision": 0.6097560975609756, + "eval_Qnli-dev_cosine_recall": 0.847457627118644, + "eval_allNLI-dev_cosine_accuracy": 0.7734375, + "eval_allNLI-dev_cosine_accuracy_threshold": 0.716580331325531, + "eval_allNLI-dev_cosine_ap": 0.669835314309871, + "eval_allNLI-dev_cosine_f1": 0.6964285714285715, + "eval_allNLI-dev_cosine_f1_threshold": 0.6203938126564026, + "eval_allNLI-dev_cosine_mcc": 0.5249655273153817, + "eval_allNLI-dev_cosine_precision": 0.5652173913043478, + "eval_allNLI-dev_cosine_recall": 0.9069767441860465, + "eval_sequential_score": 0.7346738886839252, + "eval_sts-test_pearson_cosine": 0.9033761016886019, + "eval_sts-test_spearman_cosine": 0.9159521259386664, + "step": 354 + }, + { + "epoch": 0.9030612244897959, + "eval_natural-questions_loss": 0.10953477770090103, + "eval_natural-questions_runtime": 42.1508, + "eval_natural-questions_samples_per_second": 2.681, + "eval_natural-questions_steps_per_second": 0.024, + "step": 354 + }, + { + "epoch": 0.9030612244897959, + "eval_vitaminc_loss": 2.67388653755188, + "eval_vitaminc_runtime": 1.5126, + "eval_vitaminc_samples_per_second": 74.705, + "eval_vitaminc_steps_per_second": 0.661, + "step": 354 + }, + { + "epoch": 0.9030612244897959, + "eval_xsum_loss": 0.08486746996641159, + "eval_xsum_runtime": 7.6187, + "eval_xsum_samples_per_second": 14.832, + "eval_xsum_steps_per_second": 0.131, + "step": 354 + }, + { + "epoch": 0.9030612244897959, + "eval_paws_loss": 0.020648421719670296, + "eval_paws_runtime": 1.3717, + "eval_paws_samples_per_second": 82.377, + "eval_paws_steps_per_second": 0.729, + "step": 354 + }, + { + "epoch": 0.9030612244897959, + "eval_global_dataset_loss": 0.302554726600647, + "eval_global_dataset_runtime": 15.8515, + "eval_global_dataset_samples_per_second": 16.15, + "eval_global_dataset_steps_per_second": 0.063, + "step": 354 + }, + { + "epoch": 0.9056122448979592, + "grad_norm": 1.8654156923294067, + "learning_rate": 9.496821533728994e-05, + "loss": 0.1487, + "step": 355 + }, + { + "epoch": 0.9081632653061225, + "grad_norm": 1.6592378616333008, + "learning_rate": 9.491269475431322e-05, + "loss": 0.0759, + "step": 356 + }, + { + 
"epoch": 0.9107142857142857, + "grad_norm": 1.7628471851348877, + "learning_rate": 9.48568948360519e-05, + "loss": 0.1082, + "step": 357 + }, + { + "epoch": 0.9132653061224489, + "grad_norm": 1.9294559955596924, + "learning_rate": 9.480081613433169e-05, + "loss": 0.1135, + "step": 358 + }, + { + "epoch": 0.9158163265306123, + "grad_norm": 2.2890355587005615, + "learning_rate": 9.474445920373541e-05, + "loss": 0.1303, + "step": 359 + }, + { + "epoch": 0.9183673469387755, + "grad_norm": 1.7488317489624023, + "learning_rate": 9.468782460159729e-05, + "loss": 0.0862, + "step": 360 + }, + { + "epoch": 0.9209183673469388, + "grad_norm": 1.127173900604248, + "learning_rate": 9.46309128879976e-05, + "loss": 0.0763, + "step": 361 + }, + { + "epoch": 0.923469387755102, + "grad_norm": 1.5844143629074097, + "learning_rate": 9.457372462575704e-05, + "loss": 0.0839, + "step": 362 + }, + { + "epoch": 0.9260204081632653, + "grad_norm": 1.4391447305679321, + "learning_rate": 9.451626038043122e-05, + "loss": 0.0715, + "step": 363 + }, + { + "epoch": 0.9285714285714286, + "grad_norm": 1.5871800184249878, + "learning_rate": 9.445852072030507e-05, + "loss": 0.0517, + "step": 364 + }, + { + "epoch": 0.9311224489795918, + "grad_norm": 1.5403629541397095, + "learning_rate": 9.440050621638712e-05, + "loss": 0.0902, + "step": 365 + }, + { + "epoch": 0.9336734693877551, + "grad_norm": 1.0280388593673706, + "learning_rate": 9.434221744240403e-05, + "loss": 0.0296, + "step": 366 + }, + { + "epoch": 0.9362244897959183, + "grad_norm": 1.750827431678772, + "learning_rate": 9.428365497479474e-05, + "loss": 0.1111, + "step": 367 + }, + { + "epoch": 0.9387755102040817, + "grad_norm": 1.4274876117706299, + "learning_rate": 9.422481939270489e-05, + "loss": 0.0777, + "step": 368 + }, + { + "epoch": 0.9413265306122449, + "grad_norm": 1.5455957651138306, + "learning_rate": 9.416571127798102e-05, + "loss": 0.0655, + "step": 369 + }, + { + "epoch": 0.9438775510204082, + "grad_norm": 1.4600716829299927, + "learning_rate": 9.410633121516486e-05, + "loss": 0.0833, + "step": 370 + }, + { + "epoch": 0.9464285714285714, + "grad_norm": 1.2145413160324097, + "learning_rate": 9.404667979148752e-05, + "loss": 0.0711, + "step": 371 + }, + { + "epoch": 0.9489795918367347, + "grad_norm": 1.393937587738037, + "learning_rate": 9.398675759686373e-05, + "loss": 0.0473, + "step": 372 + }, + { + "epoch": 0.951530612244898, + "grad_norm": 1.2119865417480469, + "learning_rate": 9.392656522388595e-05, + "loss": 0.0376, + "step": 373 + }, + { + "epoch": 0.9540816326530612, + "grad_norm": 1.1728087663650513, + "learning_rate": 9.38661032678185e-05, + "loss": 0.0859, + "step": 374 + }, + { + "epoch": 0.9566326530612245, + "grad_norm": 1.5121995210647583, + "learning_rate": 9.380537232659177e-05, + "loss": 0.0958, + "step": 375 + }, + { + "epoch": 0.9591836734693877, + "grad_norm": 1.415404200553894, + "learning_rate": 9.374437300079621e-05, + "loss": 0.0709, + "step": 376 + }, + { + "epoch": 0.9617346938775511, + "grad_norm": 1.3736704587936401, + "learning_rate": 9.368310589367641e-05, + "loss": 0.0674, + "step": 377 + }, + { + "epoch": 0.9642857142857143, + "grad_norm": 1.4186502695083618, + "learning_rate": 9.362157161112516e-05, + "loss": 0.0601, + "step": 378 + }, + { + "epoch": 0.9668367346938775, + "grad_norm": 1.5026627779006958, + "learning_rate": 9.35597707616775e-05, + "loss": 0.0953, + "step": 379 + }, + { + "epoch": 0.9693877551020408, + "grad_norm": 1.4679648876190186, + "learning_rate": 9.34977039565045e-05, + "loss": 0.069, + "step": 380 + 
}, + { + "epoch": 0.9719387755102041, + "grad_norm": 1.8390185832977295, + "learning_rate": 9.343537180940754e-05, + "loss": 0.0953, + "step": 381 + }, + { + "epoch": 0.9744897959183674, + "grad_norm": 1.5337008237838745, + "learning_rate": 9.337277493681193e-05, + "loss": 0.1069, + "step": 382 + }, + { + "epoch": 0.9770408163265306, + "grad_norm": 1.9199270009994507, + "learning_rate": 9.330991395776103e-05, + "loss": 0.149, + "step": 383 + }, + { + "epoch": 0.9795918367346939, + "grad_norm": 1.5701026916503906, + "learning_rate": 9.324678949390997e-05, + "loss": 0.1037, + "step": 384 + }, + { + "epoch": 0.9821428571428571, + "grad_norm": 1.501617193222046, + "learning_rate": 9.318340216951968e-05, + "loss": 0.0856, + "step": 385 + }, + { + "epoch": 0.9846938775510204, + "grad_norm": 1.3909822702407837, + "learning_rate": 9.311975261145051e-05, + "loss": 0.0465, + "step": 386 + }, + { + "epoch": 0.9872448979591837, + "grad_norm": 2.011550188064575, + "learning_rate": 9.30558414491562e-05, + "loss": 0.1756, + "step": 387 + }, + { + "epoch": 0.9897959183673469, + "grad_norm": 1.9593795537948608, + "learning_rate": 9.299166931467755e-05, + "loss": 0.0972, + "step": 388 + }, + { + "epoch": 0.9923469387755102, + "grad_norm": 1.3691438436508179, + "learning_rate": 9.292723684263624e-05, + "loss": 0.0868, + "step": 389 + }, + { + "epoch": 0.9948979591836735, + "grad_norm": 1.6974234580993652, + "learning_rate": 9.28625446702285e-05, + "loss": 0.1441, + "step": 390 + }, + { + "epoch": 0.9974489795918368, + "grad_norm": 1.8705288171768188, + "learning_rate": 9.279759343721888e-05, + "loss": 0.1154, + "step": 391 + }, + { + "epoch": 1.0, + "grad_norm": 0.007384690921753645, + "learning_rate": 9.273238378593378e-05, + "loss": 0.0001, + "step": 392 + }, + { + "epoch": 1.0025510204081634, + "grad_norm": 0.5929455161094666, + "learning_rate": 9.266691636125528e-05, + "loss": 0.0247, + "step": 393 + }, + { + "epoch": 1.0051020408163265, + "grad_norm": 1.6018315553665161, + "learning_rate": 9.260119181061466e-05, + "loss": 0.0772, + "step": 394 + }, + { + "epoch": 1.0076530612244898, + "grad_norm": 1.4430633783340454, + "learning_rate": 9.253521078398598e-05, + "loss": 0.078, + "step": 395 + }, + { + "epoch": 1.010204081632653, + "grad_norm": 1.3759105205535889, + "learning_rate": 9.24689739338797e-05, + "loss": 0.0985, + "step": 396 + }, + { + "epoch": 1.0127551020408163, + "grad_norm": 1.391664981842041, + "learning_rate": 9.240248191533622e-05, + "loss": 0.0509, + "step": 397 + }, + { + "epoch": 1.0153061224489797, + "grad_norm": 1.3567787408828735, + "learning_rate": 9.233573538591937e-05, + "loss": 0.0791, + "step": 398 + }, + { + "epoch": 1.0178571428571428, + "grad_norm": 1.7589006423950195, + "learning_rate": 9.226873500571e-05, + "loss": 0.1297, + "step": 399 + }, + { + "epoch": 1.0204081632653061, + "grad_norm": 0.83431476354599, + "learning_rate": 9.220148143729928e-05, + "loss": 0.0549, + "step": 400 + }, + { + "epoch": 1.0229591836734695, + "grad_norm": 0.7938280701637268, + "learning_rate": 9.213397534578232e-05, + "loss": 0.0274, + "step": 401 + }, + { + "epoch": 1.0255102040816326, + "grad_norm": 1.4939814805984497, + "learning_rate": 9.206621739875152e-05, + "loss": 0.0626, + "step": 402 + }, + { + "epoch": 1.028061224489796, + "grad_norm": 1.2483896017074585, + "learning_rate": 9.199820826628992e-05, + "loss": 0.0627, + "step": 403 + }, + { + "epoch": 1.030612244897959, + "grad_norm": 0.4027434289455414, + "learning_rate": 9.19299486209647e-05, + "loss": 0.0085, + "step": 404 + }, + { + 
"epoch": 1.0331632653061225, + "grad_norm": 0.9910751581192017, + "learning_rate": 9.186143913782042e-05, + "loss": 0.0883, + "step": 405 + }, + { + "epoch": 1.0357142857142858, + "grad_norm": 0.7903987169265747, + "learning_rate": 9.179268049437232e-05, + "loss": 0.0261, + "step": 406 + }, + { + "epoch": 1.038265306122449, + "grad_norm": 1.3881908655166626, + "learning_rate": 9.172367337059979e-05, + "loss": 0.0662, + "step": 407 + }, + { + "epoch": 1.0408163265306123, + "grad_norm": 1.541951060295105, + "learning_rate": 9.165441844893943e-05, + "loss": 0.088, + "step": 408 + }, + { + "epoch": 1.0433673469387754, + "grad_norm": 1.5444767475128174, + "learning_rate": 9.158491641427847e-05, + "loss": 0.0632, + "step": 409 + }, + { + "epoch": 1.0459183673469388, + "grad_norm": 1.365799069404602, + "learning_rate": 9.151516795394788e-05, + "loss": 0.0569, + "step": 410 + }, + { + "epoch": 1.0484693877551021, + "grad_norm": 1.0480999946594238, + "learning_rate": 9.144517375771568e-05, + "loss": 0.0421, + "step": 411 + }, + { + "epoch": 1.0510204081632653, + "grad_norm": 2.9614346027374268, + "learning_rate": 9.137493451778004e-05, + "loss": 0.2399, + "step": 412 + }, + { + "epoch": 1.0535714285714286, + "grad_norm": 1.0980393886566162, + "learning_rate": 9.130445092876238e-05, + "loss": 0.04, + "step": 413 + }, + { + "epoch": 1.0535714285714286, + "eval_NLI_loss": 0.7572864294052124, + "eval_NLI_runtime": 7.1469, + "eval_NLI_samples_per_second": 11.893, + "eval_NLI_steps_per_second": 0.14, + "eval_Qnli-dev_cosine_accuracy": 0.7109375, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.6783713698387146, + "eval_Qnli-dev_cosine_ap": 0.7335846443073419, + "eval_Qnli-dev_cosine_f1": 0.6986301369863013, + "eval_Qnli-dev_cosine_f1_threshold": 0.5374584197998047, + "eval_Qnli-dev_cosine_mcc": 0.3660767117214088, + "eval_Qnli-dev_cosine_precision": 0.5862068965517241, + "eval_Qnli-dev_cosine_recall": 0.864406779661017, + "eval_allNLI-dev_cosine_accuracy": 0.7890625, + "eval_allNLI-dev_cosine_accuracy_threshold": 0.7159559726715088, + "eval_allNLI-dev_cosine_ap": 0.6755797600974502, + "eval_allNLI-dev_cosine_f1": 0.7021276595744682, + "eval_allNLI-dev_cosine_f1_threshold": 0.693335235118866, + "eval_allNLI-dev_cosine_mcc": 0.5360876249970393, + "eval_allNLI-dev_cosine_precision": 0.6470588235294118, + "eval_allNLI-dev_cosine_recall": 0.7674418604651163, + "eval_sequential_score": 0.7335846443073419, + "eval_sts-test_pearson_cosine": 0.9025833410376156, + "eval_sts-test_spearman_cosine": 0.916960103350246, + "step": 413 + }, + { + "epoch": 1.0535714285714286, + "eval_natural-questions_loss": 0.12202154844999313, + "eval_natural-questions_runtime": 42.1995, + "eval_natural-questions_samples_per_second": 2.678, + "eval_natural-questions_steps_per_second": 0.024, + "step": 413 + }, + { + "epoch": 1.0535714285714286, + "eval_vitaminc_loss": 2.5621209144592285, + "eval_vitaminc_runtime": 1.5095, + "eval_vitaminc_samples_per_second": 74.86, + "eval_vitaminc_steps_per_second": 0.662, + "step": 413 + }, + { + "epoch": 1.0535714285714286, + "eval_xsum_loss": 0.08535028994083405, + "eval_xsum_runtime": 7.5904, + "eval_xsum_samples_per_second": 14.887, + "eval_xsum_steps_per_second": 0.132, + "step": 413 + }, + { + "epoch": 1.0535714285714286, + "eval_paws_loss": 0.020854558795690536, + "eval_paws_runtime": 1.3715, + "eval_paws_samples_per_second": 82.39, + "eval_paws_steps_per_second": 0.729, + "step": 413 + }, + { + "epoch": 1.0535714285714286, + "eval_global_dataset_loss": 0.31916457414627075, + 
"eval_global_dataset_runtime": 15.8824, + "eval_global_dataset_samples_per_second": 16.119, + "eval_global_dataset_steps_per_second": 0.063, + "step": 413 + }, + { + "epoch": 1.0561224489795917, + "grad_norm": 1.0013049840927124, + "learning_rate": 9.123372368770073e-05, + "loss": 0.0331, + "step": 414 + }, + { + "epoch": 1.058673469387755, + "grad_norm": 1.1385012865066528, + "learning_rate": 9.116275349404256e-05, + "loss": 0.0403, + "step": 415 + }, + { + "epoch": 1.0612244897959184, + "grad_norm": 1.5421454906463623, + "learning_rate": 9.109154104963802e-05, + "loss": 0.1124, + "step": 416 + }, + { + "epoch": 1.0637755102040816, + "grad_norm": 1.2339749336242676, + "learning_rate": 9.102008705873298e-05, + "loss": 0.0718, + "step": 417 + }, + { + "epoch": 1.066326530612245, + "grad_norm": 1.41034734249115, + "learning_rate": 9.094839222796205e-05, + "loss": 0.0764, + "step": 418 + }, + { + "epoch": 1.068877551020408, + "grad_norm": 2.008246898651123, + "learning_rate": 9.087645726634157e-05, + "loss": 0.1252, + "step": 419 + }, + { + "epoch": 1.0714285714285714, + "grad_norm": 1.067642331123352, + "learning_rate": 9.080428288526268e-05, + "loss": 0.0224, + "step": 420 + }, + { + "epoch": 1.0739795918367347, + "grad_norm": 1.124808669090271, + "learning_rate": 9.073186979848416e-05, + "loss": 0.0655, + "step": 421 + }, + { + "epoch": 1.0765306122448979, + "grad_norm": 0.34429118037223816, + "learning_rate": 9.065921872212545e-05, + "loss": 0.0611, + "step": 422 + }, + { + "epoch": 1.0790816326530612, + "grad_norm": 1.14228093624115, + "learning_rate": 9.05863303746596e-05, + "loss": 0.0579, + "step": 423 + }, + { + "epoch": 1.0816326530612246, + "grad_norm": 1.2994554042816162, + "learning_rate": 9.051320547690607e-05, + "loss": 0.0462, + "step": 424 + }, + { + "epoch": 1.0841836734693877, + "grad_norm": 1.7111074924468994, + "learning_rate": 9.04398447520237e-05, + "loss": 0.1243, + "step": 425 + }, + { + "epoch": 1.086734693877551, + "grad_norm": 1.3238322734832764, + "learning_rate": 9.036624892550343e-05, + "loss": 0.0635, + "step": 426 + }, + { + "epoch": 1.0892857142857142, + "grad_norm": 1.7746036052703857, + "learning_rate": 9.02924187251613e-05, + "loss": 0.0932, + "step": 427 + }, + { + "epoch": 1.0918367346938775, + "grad_norm": 1.107481837272644, + "learning_rate": 9.021835488113108e-05, + "loss": 0.0568, + "step": 428 + }, + { + "epoch": 1.094387755102041, + "grad_norm": 1.5251835584640503, + "learning_rate": 9.014405812585722e-05, + "loss": 0.0716, + "step": 429 + }, + { + "epoch": 1.096938775510204, + "grad_norm": 1.5086864233016968, + "learning_rate": 9.006952919408744e-05, + "loss": 0.0734, + "step": 430 + }, + { + "epoch": 1.0994897959183674, + "grad_norm": 1.458133578300476, + "learning_rate": 8.999476882286553e-05, + "loss": 0.0533, + "step": 431 + }, + { + "epoch": 1.1020408163265305, + "grad_norm": 1.3510674238204956, + "learning_rate": 8.991977775152412e-05, + "loss": 0.0593, + "step": 432 + }, + { + "epoch": 1.1045918367346939, + "grad_norm": 0.8574031591415405, + "learning_rate": 8.984455672167729e-05, + "loss": 0.0198, + "step": 433 + }, + { + "epoch": 1.1071428571428572, + "grad_norm": 1.1454882621765137, + "learning_rate": 8.976910647721326e-05, + "loss": 0.0484, + "step": 434 + }, + { + "epoch": 1.1096938775510203, + "grad_norm": 1.11487877368927, + "learning_rate": 8.969342776428704e-05, + "loss": 0.0656, + "step": 435 + }, + { + "epoch": 1.1122448979591837, + "grad_norm": 0.7230798006057739, + "learning_rate": 8.961752133131309e-05, + "loss": 0.0263, + 
"step": 436 + }, + { + "epoch": 1.114795918367347, + "grad_norm": 1.4215810298919678, + "learning_rate": 8.954138792895781e-05, + "loss": 0.0753, + "step": 437 + }, + { + "epoch": 1.1173469387755102, + "grad_norm": 0.9879924654960632, + "learning_rate": 8.946502831013219e-05, + "loss": 0.0478, + "step": 438 + }, + { + "epoch": 1.1198979591836735, + "grad_norm": 1.065205693244934, + "learning_rate": 8.938844322998443e-05, + "loss": 0.0375, + "step": 439 + }, + { + "epoch": 1.1224489795918366, + "grad_norm": 1.3906233310699463, + "learning_rate": 8.931163344589232e-05, + "loss": 0.0822, + "step": 440 + }, + { + "epoch": 1.125, + "grad_norm": 1.225745677947998, + "learning_rate": 8.923459971745589e-05, + "loss": 0.0826, + "step": 441 + }, + { + "epoch": 1.1275510204081634, + "grad_norm": 1.5490199327468872, + "learning_rate": 8.915734280648978e-05, + "loss": 0.0906, + "step": 442 + }, + { + "epoch": 1.1301020408163265, + "grad_norm": 0.15318873524665833, + "learning_rate": 8.907986347701575e-05, + "loss": 0.0251, + "step": 443 + }, + { + "epoch": 1.1326530612244898, + "grad_norm": 1.2454787492752075, + "learning_rate": 8.900216249525526e-05, + "loss": 0.034, + "step": 444 + }, + { + "epoch": 1.135204081632653, + "grad_norm": 0.7449340224266052, + "learning_rate": 8.892424062962162e-05, + "loss": 0.0149, + "step": 445 + }, + { + "epoch": 1.1377551020408163, + "grad_norm": 1.3367047309875488, + "learning_rate": 8.884609865071265e-05, + "loss": 0.063, + "step": 446 + }, + { + "epoch": 1.1403061224489797, + "grad_norm": 1.3809350728988647, + "learning_rate": 8.87677373313029e-05, + "loss": 0.0517, + "step": 447 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 1.1919385194778442, + "learning_rate": 8.868915744633606e-05, + "loss": 0.0727, + "step": 448 + }, + { + "epoch": 1.1454081632653061, + "grad_norm": 1.7786552906036377, + "learning_rate": 8.861035977291732e-05, + "loss": 0.0699, + "step": 449 + }, + { + "epoch": 1.1479591836734695, + "grad_norm": 0.8784481287002563, + "learning_rate": 8.853134509030558e-05, + "loss": 0.0464, + "step": 450 + }, + { + "epoch": 1.1505102040816326, + "grad_norm": 2.3667984008789062, + "learning_rate": 8.8452114179906e-05, + "loss": 0.1634, + "step": 451 + }, + { + "epoch": 1.153061224489796, + "grad_norm": 1.4006800651550293, + "learning_rate": 8.837266782526187e-05, + "loss": 0.0829, + "step": 452 + }, + { + "epoch": 1.155612244897959, + "grad_norm": 2.5285255908966064, + "learning_rate": 8.829300681204724e-05, + "loss": 0.2674, + "step": 453 + }, + { + "epoch": 1.1581632653061225, + "grad_norm": 1.1632329225540161, + "learning_rate": 8.821313192805898e-05, + "loss": 0.0963, + "step": 454 + }, + { + "epoch": 1.1607142857142858, + "grad_norm": 0.49350619316101074, + "learning_rate": 8.813304396320896e-05, + "loss": 0.0111, + "step": 455 + }, + { + "epoch": 1.163265306122449, + "grad_norm": 1.3978357315063477, + "learning_rate": 8.805274370951628e-05, + "loss": 0.0748, + "step": 456 + }, + { + "epoch": 1.1658163265306123, + "grad_norm": 1.41861093044281, + "learning_rate": 8.797223196109952e-05, + "loss": 0.0923, + "step": 457 + }, + { + "epoch": 1.1683673469387754, + "grad_norm": 1.0811306238174438, + "learning_rate": 8.789150951416869e-05, + "loss": 0.0283, + "step": 458 + }, + { + "epoch": 1.1709183673469388, + "grad_norm": 0.9831142425537109, + "learning_rate": 8.781057716701759e-05, + "loss": 0.0436, + "step": 459 + }, + { + "epoch": 1.1734693877551021, + "grad_norm": 1.165339469909668, + "learning_rate": 8.772943572001575e-05, + "loss": 0.0331, + 
"step": 460 + }, + { + "epoch": 1.1760204081632653, + "grad_norm": 1.128321886062622, + "learning_rate": 8.764808597560055e-05, + "loss": 0.0698, + "step": 461 + }, + { + "epoch": 1.1785714285714286, + "grad_norm": 1.302606225013733, + "learning_rate": 8.75665287382693e-05, + "loss": 0.0591, + "step": 462 + }, + { + "epoch": 1.181122448979592, + "grad_norm": 1.17527174949646, + "learning_rate": 8.748476481457131e-05, + "loss": 0.0543, + "step": 463 + }, + { + "epoch": 1.183673469387755, + "grad_norm": 1.4486712217330933, + "learning_rate": 8.740279501309987e-05, + "loss": 0.0914, + "step": 464 + }, + { + "epoch": 1.1862244897959184, + "grad_norm": 1.2726534605026245, + "learning_rate": 8.73206201444843e-05, + "loss": 0.0533, + "step": 465 + }, + { + "epoch": 1.1887755102040816, + "grad_norm": 1.7414158582687378, + "learning_rate": 8.723824102138186e-05, + "loss": 0.1107, + "step": 466 + }, + { + "epoch": 1.191326530612245, + "grad_norm": 1.4504228830337524, + "learning_rate": 8.715565845846976e-05, + "loss": 0.0687, + "step": 467 + }, + { + "epoch": 1.193877551020408, + "grad_norm": 1.321366786956787, + "learning_rate": 8.707287327243713e-05, + "loss": 0.0373, + "step": 468 + }, + { + "epoch": 1.1964285714285714, + "grad_norm": 1.0901155471801758, + "learning_rate": 8.69898862819769e-05, + "loss": 0.0398, + "step": 469 + }, + { + "epoch": 1.1989795918367347, + "grad_norm": 0.8066145181655884, + "learning_rate": 8.690669830777773e-05, + "loss": 0.0373, + "step": 470 + }, + { + "epoch": 1.2015306122448979, + "grad_norm": 1.5451761484146118, + "learning_rate": 8.682331017251586e-05, + "loss": 0.0779, + "step": 471 + }, + { + "epoch": 1.2040816326530612, + "grad_norm": 0.7129878401756287, + "learning_rate": 8.6739722700847e-05, + "loss": 0.0196, + "step": 472 + }, + { + "epoch": 1.2040816326530612, + "eval_NLI_loss": 0.8001277446746826, + "eval_NLI_runtime": 7.1574, + "eval_NLI_samples_per_second": 11.876, + "eval_NLI_steps_per_second": 0.14, + "eval_Qnli-dev_cosine_accuracy": 0.7109375, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.7011003494262695, + "eval_Qnli-dev_cosine_ap": 0.7324635642151244, + "eval_Qnli-dev_cosine_f1": 0.7142857142857142, + "eval_Qnli-dev_cosine_f1_threshold": 0.5691071152687073, + "eval_Qnli-dev_cosine_mcc": 0.4117570324277877, + "eval_Qnli-dev_cosine_precision": 0.6172839506172839, + "eval_Qnli-dev_cosine_recall": 0.847457627118644, + "eval_allNLI-dev_cosine_accuracy": 0.7734375, + "eval_allNLI-dev_cosine_accuracy_threshold": 0.783771812915802, + "eval_allNLI-dev_cosine_ap": 0.6760763516055974, + "eval_allNLI-dev_cosine_f1": 0.7058823529411764, + "eval_allNLI-dev_cosine_f1_threshold": 0.658562183380127, + "eval_allNLI-dev_cosine_mcc": 0.5368906701581015, + "eval_allNLI-dev_cosine_precision": 0.6101694915254238, + "eval_allNLI-dev_cosine_recall": 0.8372093023255814, + "eval_sequential_score": 0.7324635642151244, + "eval_sts-test_pearson_cosine": 0.8993434582092046, + "eval_sts-test_spearman_cosine": 0.9158999012166225, + "step": 472 + }, + { + "epoch": 1.2040816326530612, + "eval_natural-questions_loss": 0.13460519909858704, + "eval_natural-questions_runtime": 42.1765, + "eval_natural-questions_samples_per_second": 2.679, + "eval_natural-questions_steps_per_second": 0.024, + "step": 472 + }, + { + "epoch": 1.2040816326530612, + "eval_vitaminc_loss": 2.3656909465789795, + "eval_vitaminc_runtime": 1.5196, + "eval_vitaminc_samples_per_second": 74.361, + "eval_vitaminc_steps_per_second": 0.658, + "step": 472 + }, + { + "epoch": 1.2040816326530612, + "eval_xsum_loss": 
0.07306504994630814, + "eval_xsum_runtime": 7.5999, + "eval_xsum_samples_per_second": 14.869, + "eval_xsum_steps_per_second": 0.132, + "step": 472 + }, + { + "epoch": 1.2040816326530612, + "eval_paws_loss": 0.020180676132440567, + "eval_paws_runtime": 1.3731, + "eval_paws_samples_per_second": 82.298, + "eval_paws_steps_per_second": 0.728, + "step": 472 + }, + { + "epoch": 1.2040816326530612, + "eval_global_dataset_loss": 0.3437402546405792, + "eval_global_dataset_runtime": 15.857, + "eval_global_dataset_samples_per_second": 16.144, + "eval_global_dataset_steps_per_second": 0.063, + "step": 472 + }, + { + "epoch": 1.2066326530612246, + "grad_norm": 1.1227662563323975, + "learning_rate": 8.665593671939818e-05, + "loss": 0.0459, + "step": 473 + }, + { + "epoch": 1.2091836734693877, + "grad_norm": 1.9399704933166504, + "learning_rate": 8.657195305675956e-05, + "loss": 0.0984, + "step": 474 + }, + { + "epoch": 1.211734693877551, + "grad_norm": 0.9964848756790161, + "learning_rate": 8.648777254347624e-05, + "loss": 0.0394, + "step": 475 + }, + { + "epoch": 1.2142857142857142, + "grad_norm": 1.2951750755310059, + "learning_rate": 8.640339601204006e-05, + "loss": 0.0453, + "step": 476 + }, + { + "epoch": 1.2168367346938775, + "grad_norm": 1.8146740198135376, + "learning_rate": 8.631882429688132e-05, + "loss": 0.1034, + "step": 477 + }, + { + "epoch": 1.219387755102041, + "grad_norm": 1.073499321937561, + "learning_rate": 8.623405823436064e-05, + "loss": 0.0432, + "step": 478 + }, + { + "epoch": 1.221938775510204, + "grad_norm": 1.7169091701507568, + "learning_rate": 8.61490986627605e-05, + "loss": 0.1112, + "step": 479 + }, + { + "epoch": 1.2244897959183674, + "grad_norm": 1.206787347793579, + "learning_rate": 8.606394642227717e-05, + "loss": 0.1027, + "step": 480 + }, + { + "epoch": 1.2270408163265305, + "grad_norm": 1.2718307971954346, + "learning_rate": 8.597860235501225e-05, + "loss": 0.0717, + "step": 481 + }, + { + "epoch": 1.2295918367346939, + "grad_norm": 1.3505269289016724, + "learning_rate": 8.589306730496434e-05, + "loss": 0.0858, + "step": 482 + }, + { + "epoch": 1.2321428571428572, + "grad_norm": 1.3387874364852905, + "learning_rate": 8.580734211802079e-05, + "loss": 0.081, + "step": 483 + }, + { + "epoch": 1.2346938775510203, + "grad_norm": 1.0986344814300537, + "learning_rate": 8.57214276419493e-05, + "loss": 0.0342, + "step": 484 + }, + { + "epoch": 1.2372448979591837, + "grad_norm": 0.9662691354751587, + "learning_rate": 8.563532472638947e-05, + "loss": 0.0239, + "step": 485 + }, + { + "epoch": 1.239795918367347, + "grad_norm": 1.5100804567337036, + "learning_rate": 8.554903422284448e-05, + "loss": 0.0657, + "step": 486 + }, + { + "epoch": 1.2423469387755102, + "grad_norm": 1.467030644416809, + "learning_rate": 8.54625569846726e-05, + "loss": 0.0461, + "step": 487 + }, + { + "epoch": 1.2448979591836735, + "grad_norm": 1.4691377878189087, + "learning_rate": 8.537589386707885e-05, + "loss": 0.0738, + "step": 488 + }, + { + "epoch": 1.2474489795918366, + "grad_norm": 2.0533835887908936, + "learning_rate": 8.52890457271064e-05, + "loss": 0.094, + "step": 489 + }, + { + "epoch": 1.25, + "grad_norm": 1.2277331352233887, + "learning_rate": 8.520201342362826e-05, + "loss": 0.0414, + "step": 490 + }, + { + "epoch": 1.2525510204081631, + "grad_norm": 1.0798680782318115, + "learning_rate": 8.511479781733864e-05, + "loss": 0.0376, + "step": 491 + }, + { + "epoch": 1.2551020408163265, + "grad_norm": 1.0401206016540527, + "learning_rate": 8.502739977074447e-05, + "loss": 0.0504, + "step": 492 
+ }, + { + "epoch": 1.2576530612244898, + "grad_norm": 1.7602112293243408, + "learning_rate": 8.4939820148157e-05, + "loss": 0.0856, + "step": 493 + }, + { + "epoch": 1.260204081632653, + "grad_norm": 2.4652020931243896, + "learning_rate": 8.485205981568307e-05, + "loss": 0.134, + "step": 494 + }, + { + "epoch": 1.2627551020408163, + "grad_norm": 1.277016520500183, + "learning_rate": 8.476411964121667e-05, + "loss": 0.0591, + "step": 495 + }, + { + "epoch": 1.2653061224489797, + "grad_norm": 1.302207350730896, + "learning_rate": 8.467600049443025e-05, + "loss": 0.0569, + "step": 496 + }, + { + "epoch": 1.2678571428571428, + "grad_norm": 1.4229450225830078, + "learning_rate": 8.458770324676626e-05, + "loss": 0.099, + "step": 497 + }, + { + "epoch": 1.2704081632653061, + "grad_norm": 1.150447964668274, + "learning_rate": 8.44992287714284e-05, + "loss": 0.0343, + "step": 498 + }, + { + "epoch": 1.2729591836734695, + "grad_norm": 1.1253153085708618, + "learning_rate": 8.441057794337308e-05, + "loss": 0.0324, + "step": 499 + }, + { + "epoch": 1.2755102040816326, + "grad_norm": 1.093632698059082, + "learning_rate": 8.43217516393007e-05, + "loss": 0.0709, + "step": 500 + }, + { + "epoch": 1.278061224489796, + "grad_norm": 1.2758386135101318, + "learning_rate": 8.4232750737647e-05, + "loss": 0.0449, + "step": 501 + }, + { + "epoch": 1.280612244897959, + "grad_norm": 1.906056523323059, + "learning_rate": 8.41435761185744e-05, + "loss": 0.0565, + "step": 502 + }, + { + "epoch": 1.2831632653061225, + "grad_norm": 2.139589548110962, + "learning_rate": 8.405422866396326e-05, + "loss": 0.1233, + "step": 503 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 2.5513923168182373, + "learning_rate": 8.396470925740321e-05, + "loss": 0.3064, + "step": 504 + }, + { + "epoch": 1.288265306122449, + "grad_norm": 0.4013485908508301, + "learning_rate": 8.387501878418431e-05, + "loss": 0.0274, + "step": 505 + }, + { + "epoch": 1.2908163265306123, + "grad_norm": 2.8001515865325928, + "learning_rate": 8.37851581312884e-05, + "loss": 0.1614, + "step": 506 + }, + { + "epoch": 1.2933673469387754, + "grad_norm": 0.7105947732925415, + "learning_rate": 8.36951281873803e-05, + "loss": 0.022, + "step": 507 + }, + { + "epoch": 1.2959183673469388, + "grad_norm": 0.8565807938575745, + "learning_rate": 8.3604929842799e-05, + "loss": 0.0268, + "step": 508 + }, + { + "epoch": 1.2984693877551021, + "grad_norm": 1.2848799228668213, + "learning_rate": 8.351456398954882e-05, + "loss": 0.0392, + "step": 509 + }, + { + "epoch": 1.3010204081632653, + "grad_norm": 1.6219931840896606, + "learning_rate": 8.342403152129075e-05, + "loss": 0.0767, + "step": 510 + }, + { + "epoch": 1.3035714285714286, + "grad_norm": 1.0111511945724487, + "learning_rate": 8.333333333333333e-05, + "loss": 0.0479, + "step": 511 + }, + { + "epoch": 1.306122448979592, + "grad_norm": 1.109157681465149, + "learning_rate": 8.324247032262415e-05, + "loss": 0.0307, + "step": 512 + }, + { + "epoch": 1.308673469387755, + "grad_norm": 1.5236334800720215, + "learning_rate": 8.315144338774063e-05, + "loss": 0.0754, + "step": 513 + }, + { + "epoch": 1.3112244897959184, + "grad_norm": 1.2130945920944214, + "learning_rate": 8.306025342888139e-05, + "loss": 0.0554, + "step": 514 + }, + { + "epoch": 1.3137755102040816, + "grad_norm": 1.5156704187393188, + "learning_rate": 8.296890134785724e-05, + "loss": 0.0701, + "step": 515 + }, + { + "epoch": 1.316326530612245, + "grad_norm": 1.3881676197052002, + "learning_rate": 8.287738804808223e-05, + "loss": 0.0794, + "step": 516 + }, + 
{ + "epoch": 1.318877551020408, + "grad_norm": 1.2554371356964111, + "learning_rate": 8.278571443456483e-05, + "loss": 0.0608, + "step": 517 + }, + { + "epoch": 1.3214285714285714, + "grad_norm": 1.1399636268615723, + "learning_rate": 8.269388141389883e-05, + "loss": 0.029, + "step": 518 + }, + { + "epoch": 1.3239795918367347, + "grad_norm": 1.1398260593414307, + "learning_rate": 8.260188989425455e-05, + "loss": 0.0302, + "step": 519 + }, + { + "epoch": 1.3265306122448979, + "grad_norm": 1.1707773208618164, + "learning_rate": 8.250974078536967e-05, + "loss": 0.0446, + "step": 520 + }, + { + "epoch": 1.3290816326530612, + "grad_norm": 1.8493636846542358, + "learning_rate": 8.24174349985404e-05, + "loss": 0.14, + "step": 521 + }, + { + "epoch": 1.3316326530612246, + "grad_norm": 0.8552718758583069, + "learning_rate": 8.232497344661236e-05, + "loss": 0.0238, + "step": 522 + }, + { + "epoch": 1.3341836734693877, + "grad_norm": 1.3908804655075073, + "learning_rate": 8.223235704397161e-05, + "loss": 0.0588, + "step": 523 + }, + { + "epoch": 1.336734693877551, + "grad_norm": 2.0493106842041016, + "learning_rate": 8.213958670653555e-05, + "loss": 0.147, + "step": 524 + }, + { + "epoch": 1.3392857142857144, + "grad_norm": 0.8251523971557617, + "learning_rate": 8.204666335174392e-05, + "loss": 0.0354, + "step": 525 + }, + { + "epoch": 1.3418367346938775, + "grad_norm": 0.88651442527771, + "learning_rate": 8.19535878985497e-05, + "loss": 0.0304, + "step": 526 + }, + { + "epoch": 1.344387755102041, + "grad_norm": 1.3557745218276978, + "learning_rate": 8.186036126741005e-05, + "loss": 0.0651, + "step": 527 + }, + { + "epoch": 1.346938775510204, + "grad_norm": 1.958229422569275, + "learning_rate": 8.176698438027715e-05, + "loss": 0.1694, + "step": 528 + }, + { + "epoch": 1.3494897959183674, + "grad_norm": 1.585067629814148, + "learning_rate": 8.167345816058911e-05, + "loss": 0.0912, + "step": 529 + }, + { + "epoch": 1.3520408163265305, + "grad_norm": 1.1764601469039917, + "learning_rate": 8.15797835332609e-05, + "loss": 0.0501, + "step": 530 + }, + { + "epoch": 1.3545918367346939, + "grad_norm": 1.0664958953857422, + "learning_rate": 8.14859614246751e-05, + "loss": 0.0729, + "step": 531 + }, + { + "epoch": 1.3545918367346939, + "eval_NLI_loss": 0.7907916903495789, + "eval_NLI_runtime": 7.1619, + "eval_NLI_samples_per_second": 11.868, + "eval_NLI_steps_per_second": 0.14, + "eval_Qnli-dev_cosine_accuracy": 0.71875, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.6904141902923584, + "eval_Qnli-dev_cosine_ap": 0.7358263807652359, + "eval_Qnli-dev_cosine_f1": 0.7132867132867132, + "eval_Qnli-dev_cosine_f1_threshold": 0.545956015586853, + "eval_Qnli-dev_cosine_mcc": 0.4052621174492469, + "eval_Qnli-dev_cosine_precision": 0.6071428571428571, + "eval_Qnli-dev_cosine_recall": 0.864406779661017, + "eval_allNLI-dev_cosine_accuracy": 0.78125, + "eval_allNLI-dev_cosine_accuracy_threshold": 0.6931483745574951, + "eval_allNLI-dev_cosine_ap": 0.671359766550438, + "eval_allNLI-dev_cosine_f1": 0.7254901960784315, + "eval_allNLI-dev_cosine_f1_threshold": 0.6645922064781189, + "eval_allNLI-dev_cosine_mcc": 0.5700736763291478, + "eval_allNLI-dev_cosine_precision": 0.6271186440677966, + "eval_allNLI-dev_cosine_recall": 0.8604651162790697, + "eval_sequential_score": 0.7358263807652359, + "eval_sts-test_pearson_cosine": 0.9024030078118572, + "eval_sts-test_spearman_cosine": 0.9156696559767638, + "step": 531 + }, + { + "epoch": 1.3545918367346939, + "eval_natural-questions_loss": 0.12502792477607727, + 
"eval_natural-questions_runtime": 42.1947, + "eval_natural-questions_samples_per_second": 2.678, + "eval_natural-questions_steps_per_second": 0.024, + "step": 531 + }, + { + "epoch": 1.3545918367346939, + "eval_vitaminc_loss": 2.3196637630462646, + "eval_vitaminc_runtime": 1.5118, + "eval_vitaminc_samples_per_second": 74.747, + "eval_vitaminc_steps_per_second": 0.661, + "step": 531 + }, + { + "epoch": 1.3545918367346939, + "eval_xsum_loss": 0.10098682343959808, + "eval_xsum_runtime": 7.5925, + "eval_xsum_samples_per_second": 14.883, + "eval_xsum_steps_per_second": 0.132, + "step": 531 + }, + { + "epoch": 1.3545918367346939, + "eval_paws_loss": 0.020537061616778374, + "eval_paws_runtime": 1.372, + "eval_paws_samples_per_second": 82.364, + "eval_paws_steps_per_second": 0.729, + "step": 531 + }, + { + "epoch": 1.3545918367346939, + "eval_global_dataset_loss": 0.27962425351142883, + "eval_global_dataset_runtime": 15.8593, + "eval_global_dataset_samples_per_second": 16.142, + "eval_global_dataset_steps_per_second": 0.063, + "step": 531 + }, + { + "epoch": 1.3571428571428572, + "grad_norm": 0.8320642113685608, + "learning_rate": 8.139199276267278e-05, + "loss": 0.0273, + "step": 532 + }, + { + "epoch": 1.3596938775510203, + "grad_norm": 1.2157062292099, + "learning_rate": 8.129787847654435e-05, + "loss": 0.0709, + "step": 533 + }, + { + "epoch": 1.3622448979591837, + "grad_norm": 1.4499181509017944, + "learning_rate": 8.120361949702037e-05, + "loss": 0.0852, + "step": 534 + }, + { + "epoch": 1.364795918367347, + "grad_norm": 1.3032598495483398, + "learning_rate": 8.11092167562623e-05, + "loss": 0.0669, + "step": 535 + }, + { + "epoch": 1.3673469387755102, + "grad_norm": 1.4867783784866333, + "learning_rate": 8.101467118785328e-05, + "loss": 0.0446, + "step": 536 + }, + { + "epoch": 1.3698979591836735, + "grad_norm": 0.8226636648178101, + "learning_rate": 8.091998372678898e-05, + "loss": 0.0206, + "step": 537 + }, + { + "epoch": 1.3724489795918369, + "grad_norm": 0.9153483510017395, + "learning_rate": 8.082515530946827e-05, + "loss": 0.0348, + "step": 538 + }, + { + "epoch": 1.375, + "grad_norm": 0.5238258838653564, + "learning_rate": 8.073018687368399e-05, + "loss": 0.0138, + "step": 539 + }, + { + "epoch": 1.3775510204081631, + "grad_norm": 1.3145301342010498, + "learning_rate": 8.063507935861368e-05, + "loss": 0.0769, + "step": 540 + }, + { + "epoch": 1.3801020408163265, + "grad_norm": 1.43368661403656, + "learning_rate": 8.053983370481026e-05, + "loss": 0.0778, + "step": 541 + }, + { + "epoch": 1.3826530612244898, + "grad_norm": 0.8249632120132446, + "learning_rate": 8.044445085419281e-05, + "loss": 0.0293, + "step": 542 + }, + { + "epoch": 1.385204081632653, + "grad_norm": 0.9206202626228333, + "learning_rate": 8.034893175003713e-05, + "loss": 0.0295, + "step": 543 + }, + { + "epoch": 1.3877551020408163, + "grad_norm": 0.7518428564071655, + "learning_rate": 8.025327733696655e-05, + "loss": 0.0235, + "step": 544 + }, + { + "epoch": 1.3903061224489797, + "grad_norm": 1.3299691677093506, + "learning_rate": 8.015748856094246e-05, + "loss": 0.0542, + "step": 545 + }, + { + "epoch": 1.3928571428571428, + "grad_norm": 1.4092249870300293, + "learning_rate": 8.006156636925505e-05, + "loss": 0.0552, + "step": 546 + }, + { + "epoch": 1.3954081632653061, + "grad_norm": 1.204359531402588, + "learning_rate": 7.996551171051388e-05, + "loss": 0.0502, + "step": 547 + }, + { + "epoch": 1.3979591836734695, + "grad_norm": 1.4326294660568237, + "learning_rate": 7.986932553463857e-05, + "loss": 0.0514, + "step": 
548 + }, + { + "epoch": 1.4005102040816326, + "grad_norm": 1.8715698719024658, + "learning_rate": 7.97730087928493e-05, + "loss": 0.0981, + "step": 549 + }, + { + "epoch": 1.403061224489796, + "grad_norm": 1.2275328636169434, + "learning_rate": 7.967656243765754e-05, + "loss": 0.0514, + "step": 550 + }, + { + "epoch": 1.405612244897959, + "grad_norm": 0.9787119626998901, + "learning_rate": 7.957998742285644e-05, + "loss": 0.0382, + "step": 551 + }, + { + "epoch": 1.4081632653061225, + "grad_norm": 2.1592862606048584, + "learning_rate": 7.948328470351165e-05, + "loss": 0.1352, + "step": 552 + }, + { + "epoch": 1.4107142857142856, + "grad_norm": 1.1729885339736938, + "learning_rate": 7.938645523595163e-05, + "loss": 0.0379, + "step": 553 + }, + { + "epoch": 1.413265306122449, + "grad_norm": 1.0441006422042847, + "learning_rate": 7.928949997775834e-05, + "loss": 0.0542, + "step": 554 + }, + { + "epoch": 1.4158163265306123, + "grad_norm": 1.0458109378814697, + "learning_rate": 7.919241988775774e-05, + "loss": 0.026, + "step": 555 + }, + { + "epoch": 1.4183673469387754, + "grad_norm": 2.026048421859741, + "learning_rate": 7.909521592601031e-05, + "loss": 0.1222, + "step": 556 + }, + { + "epoch": 1.4209183673469388, + "grad_norm": 1.4532408714294434, + "learning_rate": 7.899788905380146e-05, + "loss": 0.059, + "step": 557 + }, + { + "epoch": 1.4234693877551021, + "grad_norm": 1.7006900310516357, + "learning_rate": 7.890044023363219e-05, + "loss": 0.0621, + "step": 558 + }, + { + "epoch": 1.4260204081632653, + "grad_norm": 1.5406718254089355, + "learning_rate": 7.880287042920946e-05, + "loss": 0.1069, + "step": 559 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 1.6419098377227783, + "learning_rate": 7.87051806054367e-05, + "loss": 0.0804, + "step": 560 + }, + { + "epoch": 1.431122448979592, + "grad_norm": 1.6258991956710815, + "learning_rate": 7.860737172840424e-05, + "loss": 0.0754, + "step": 561 + }, + { + "epoch": 1.433673469387755, + "grad_norm": 0.8665074706077576, + "learning_rate": 7.85094447653798e-05, + "loss": 0.0225, + "step": 562 + }, + { + "epoch": 1.4362244897959184, + "grad_norm": 0.7240162491798401, + "learning_rate": 7.841140068479882e-05, + "loss": 0.0243, + "step": 563 + }, + { + "epoch": 1.4387755102040816, + "grad_norm": 1.7290624380111694, + "learning_rate": 7.831324045625503e-05, + "loss": 0.0749, + "step": 564 + }, + { + "epoch": 1.441326530612245, + "grad_norm": 0.632347822189331, + "learning_rate": 7.82149650504908e-05, + "loss": 0.0161, + "step": 565 + }, + { + "epoch": 1.443877551020408, + "grad_norm": 1.123807430267334, + "learning_rate": 7.811657543938745e-05, + "loss": 0.0461, + "step": 566 + }, + { + "epoch": 1.4464285714285714, + "grad_norm": 1.5698630809783936, + "learning_rate": 7.801807259595579e-05, + "loss": 0.0736, + "step": 567 + }, + { + "epoch": 1.4489795918367347, + "grad_norm": 1.1025331020355225, + "learning_rate": 7.791945749432644e-05, + "loss": 0.076, + "step": 568 + }, + { + "epoch": 1.4515306122448979, + "grad_norm": 1.5680058002471924, + "learning_rate": 7.782073110974006e-05, + "loss": 0.0904, + "step": 569 + }, + { + "epoch": 1.4540816326530612, + "grad_norm": 1.2816041707992554, + "learning_rate": 7.772189441853798e-05, + "loss": 0.0704, + "step": 570 + }, + { + "epoch": 1.4566326530612246, + "grad_norm": 1.4532057046890259, + "learning_rate": 7.762294839815231e-05, + "loss": 0.0469, + "step": 571 + }, + { + "epoch": 1.4591836734693877, + "grad_norm": 1.2575308084487915, + "learning_rate": 7.752389402709637e-05, + "loss": 0.0355, + 
"step": 572 + }, + { + "epoch": 1.461734693877551, + "grad_norm": 1.221437931060791, + "learning_rate": 7.742473228495499e-05, + "loss": 0.0398, + "step": 573 + }, + { + "epoch": 1.4642857142857144, + "grad_norm": 2.4460315704345703, + "learning_rate": 7.732546415237484e-05, + "loss": 0.136, + "step": 574 + }, + { + "epoch": 1.4668367346938775, + "grad_norm": 1.523794174194336, + "learning_rate": 7.722609061105477e-05, + "loss": 0.0562, + "step": 575 + }, + { + "epoch": 1.469387755102041, + "grad_norm": 2.08244252204895, + "learning_rate": 7.712661264373596e-05, + "loss": 0.0971, + "step": 576 + }, + { + "epoch": 1.471938775510204, + "grad_norm": 1.4276880025863647, + "learning_rate": 7.702703123419238e-05, + "loss": 0.0626, + "step": 577 + }, + { + "epoch": 1.4744897959183674, + "grad_norm": 1.9193519353866577, + "learning_rate": 7.692734736722094e-05, + "loss": 0.0654, + "step": 578 + }, + { + "epoch": 1.4770408163265305, + "grad_norm": 2.1438186168670654, + "learning_rate": 7.682756202863178e-05, + "loss": 0.3237, + "step": 579 + }, + { + "epoch": 1.4795918367346939, + "grad_norm": 0.8090245127677917, + "learning_rate": 7.672767620523857e-05, + "loss": 0.0122, + "step": 580 + }, + { + "epoch": 1.4821428571428572, + "grad_norm": 1.0718419551849365, + "learning_rate": 7.662769088484866e-05, + "loss": 0.0237, + "step": 581 + }, + { + "epoch": 1.4846938775510203, + "grad_norm": 1.350030779838562, + "learning_rate": 7.652760705625343e-05, + "loss": 0.0606, + "step": 582 + }, + { + "epoch": 1.4872448979591837, + "grad_norm": 1.8158615827560425, + "learning_rate": 7.642742570921834e-05, + "loss": 0.1069, + "step": 583 + }, + { + "epoch": 1.489795918367347, + "grad_norm": 2.003227949142456, + "learning_rate": 7.632714783447338e-05, + "loss": 0.139, + "step": 584 + }, + { + "epoch": 1.4923469387755102, + "grad_norm": 1.248184323310852, + "learning_rate": 7.622677442370302e-05, + "loss": 0.0433, + "step": 585 + }, + { + "epoch": 1.4948979591836735, + "grad_norm": 1.4494481086730957, + "learning_rate": 7.612630646953658e-05, + "loss": 0.0633, + "step": 586 + }, + { + "epoch": 1.4974489795918369, + "grad_norm": 2.7592310905456543, + "learning_rate": 7.602574496553833e-05, + "loss": 0.499, + "step": 587 + }, + { + "epoch": 1.5, + "grad_norm": 1.520603895187378, + "learning_rate": 7.592509090619769e-05, + "loss": 0.0479, + "step": 588 + }, + { + "epoch": 1.5025510204081631, + "grad_norm": 1.4098992347717285, + "learning_rate": 7.582434528691944e-05, + "loss": 0.0586, + "step": 589 + }, + { + "epoch": 1.5051020408163265, + "grad_norm": 1.2117429971694946, + "learning_rate": 7.572350910401377e-05, + "loss": 0.047, + "step": 590 + }, + { + "epoch": 1.5051020408163265, + "eval_NLI_loss": 0.8208035826683044, + "eval_NLI_runtime": 7.1322, + "eval_NLI_samples_per_second": 11.918, + "eval_NLI_steps_per_second": 0.14, + "eval_Qnli-dev_cosine_accuracy": 0.734375, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.6462156772613525, + "eval_Qnli-dev_cosine_ap": 0.7268076865173757, + "eval_Qnli-dev_cosine_f1": 0.7183098591549295, + "eval_Qnli-dev_cosine_f1_threshold": 0.5285258293151855, + "eval_Qnli-dev_cosine_mcc": 0.4182713391161482, + "eval_Qnli-dev_cosine_precision": 0.6144578313253012, + "eval_Qnli-dev_cosine_recall": 0.864406779661017, + "eval_allNLI-dev_cosine_accuracy": 0.7578125, + "eval_allNLI-dev_cosine_accuracy_threshold": 0.6917041540145874, + "eval_allNLI-dev_cosine_ap": 0.6575783916365846, + "eval_allNLI-dev_cosine_f1": 0.6990291262135923, + "eval_allNLI-dev_cosine_f1_threshold": 0.6497839689254761, + 
"eval_allNLI-dev_cosine_mcc": 0.5251630700083426, + "eval_allNLI-dev_cosine_precision": 0.6, + "eval_allNLI-dev_cosine_recall": 0.8372093023255814, + "eval_sequential_score": 0.7268076865173757, + "eval_sts-test_pearson_cosine": 0.9050361208497995, + "eval_sts-test_spearman_cosine": 0.9183510173055522, + "step": 590 + }, + { + "epoch": 1.5051020408163265, + "eval_natural-questions_loss": 0.14143633842468262, + "eval_natural-questions_runtime": 42.1608, + "eval_natural-questions_samples_per_second": 2.68, + "eval_natural-questions_steps_per_second": 0.024, + "step": 590 + }, + { + "epoch": 1.5051020408163265, + "eval_vitaminc_loss": 2.380038261413574, + "eval_vitaminc_runtime": 1.5031, + "eval_vitaminc_samples_per_second": 75.178, + "eval_vitaminc_steps_per_second": 0.665, + "step": 590 + }, + { + "epoch": 1.5051020408163265, + "eval_xsum_loss": 0.08793248236179352, + "eval_xsum_runtime": 7.582, + "eval_xsum_samples_per_second": 14.904, + "eval_xsum_steps_per_second": 0.132, + "step": 590 + }, + { + "epoch": 1.5051020408163265, + "eval_paws_loss": 0.02044089324772358, + "eval_paws_runtime": 1.3708, + "eval_paws_samples_per_second": 82.435, + "eval_paws_steps_per_second": 0.73, + "step": 590 + }, + { + "epoch": 1.5051020408163265, + "eval_global_dataset_loss": 0.31436362862586975, + "eval_global_dataset_runtime": 15.8245, + "eval_global_dataset_samples_per_second": 16.177, + "eval_global_dataset_steps_per_second": 0.063, + "step": 590 + }, + { + "epoch": 1.5076530612244898, + "grad_norm": 1.671633005142212, + "learning_rate": 7.56225833546865e-05, + "loss": 0.071, + "step": 591 + }, + { + "epoch": 1.510204081632653, + "grad_norm": 1.3503303527832031, + "learning_rate": 7.552156903702922e-05, + "loss": 0.0954, + "step": 592 + }, + { + "epoch": 1.5127551020408163, + "grad_norm": 0.9186678528785706, + "learning_rate": 7.542046715000941e-05, + "loss": 0.0224, + "step": 593 + }, + { + "epoch": 1.5153061224489797, + "grad_norm": 0.9596667289733887, + "learning_rate": 7.531927869346052e-05, + "loss": 0.0772, + "step": 594 + }, + { + "epoch": 1.5178571428571428, + "grad_norm": 1.177789330482483, + "learning_rate": 7.521800466807219e-05, + "loss": 0.0385, + "step": 595 + }, + { + "epoch": 1.5204081632653061, + "grad_norm": 1.5873372554779053, + "learning_rate": 7.511664607538017e-05, + "loss": 0.054, + "step": 596 + }, + { + "epoch": 1.5229591836734695, + "grad_norm": 1.70867919921875, + "learning_rate": 7.501520391775662e-05, + "loss": 0.0575, + "step": 597 + }, + { + "epoch": 1.5255102040816326, + "grad_norm": 1.6293933391571045, + "learning_rate": 7.491367919840008e-05, + "loss": 0.0843, + "step": 598 + }, + { + "epoch": 1.5280612244897958, + "grad_norm": 1.430991291999817, + "learning_rate": 7.481207292132557e-05, + "loss": 0.0619, + "step": 599 + }, + { + "epoch": 1.5306122448979593, + "grad_norm": 1.769045114517212, + "learning_rate": 7.471038609135464e-05, + "loss": 0.1038, + "step": 600 + }, + { + "epoch": 1.5331632653061225, + "grad_norm": 1.2034945487976074, + "learning_rate": 7.460861971410551e-05, + "loss": 0.0363, + "step": 601 + }, + { + "epoch": 1.5357142857142856, + "grad_norm": 1.2666120529174805, + "learning_rate": 7.450677479598301e-05, + "loss": 0.0509, + "step": 602 + }, + { + "epoch": 1.538265306122449, + "grad_norm": 1.9402971267700195, + "learning_rate": 7.440485234416872e-05, + "loss": 0.1219, + "step": 603 + }, + { + "epoch": 1.5408163265306123, + "grad_norm": 0.6082718372344971, + "learning_rate": 7.4302853366611e-05, + "loss": 0.0381, + "step": 604 + }, + { + "epoch": 
1.5433673469387754, + "grad_norm": 0.972944438457489, + "learning_rate": 7.420077887201498e-05, + "loss": 0.0303, + "step": 605 + }, + { + "epoch": 1.5459183673469388, + "grad_norm": 1.2257813215255737, + "learning_rate": 7.409862986983258e-05, + "loss": 0.04, + "step": 606 + }, + { + "epoch": 1.5484693877551021, + "grad_norm": 1.685649037361145, + "learning_rate": 7.399640737025259e-05, + "loss": 0.0718, + "step": 607 + }, + { + "epoch": 1.5510204081632653, + "grad_norm": 1.2359188795089722, + "learning_rate": 7.389411238419065e-05, + "loss": 0.048, + "step": 608 + }, + { + "epoch": 1.5535714285714286, + "grad_norm": 1.400841236114502, + "learning_rate": 7.37917459232792e-05, + "loss": 0.0963, + "step": 609 + }, + { + "epoch": 1.556122448979592, + "grad_norm": 1.3662970066070557, + "learning_rate": 7.368930899985755e-05, + "loss": 0.0606, + "step": 610 + }, + { + "epoch": 1.558673469387755, + "grad_norm": 1.7486293315887451, + "learning_rate": 7.358680262696185e-05, + "loss": 0.0801, + "step": 611 + }, + { + "epoch": 1.5612244897959182, + "grad_norm": 1.5987374782562256, + "learning_rate": 7.348422781831504e-05, + "loss": 0.1044, + "step": 612 + }, + { + "epoch": 1.5637755102040818, + "grad_norm": 1.56114661693573, + "learning_rate": 7.338158558831684e-05, + "loss": 0.0713, + "step": 613 + }, + { + "epoch": 1.566326530612245, + "grad_norm": 1.8401665687561035, + "learning_rate": 7.327887695203378e-05, + "loss": 0.0746, + "step": 614 + }, + { + "epoch": 1.568877551020408, + "grad_norm": 1.6297073364257812, + "learning_rate": 7.317610292518901e-05, + "loss": 0.1045, + "step": 615 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 1.3684359788894653, + "learning_rate": 7.307326452415244e-05, + "loss": 0.0897, + "step": 616 + }, + { + "epoch": 1.5739795918367347, + "grad_norm": 1.232054591178894, + "learning_rate": 7.297036276593055e-05, + "loss": 0.0447, + "step": 617 + }, + { + "epoch": 1.5765306122448979, + "grad_norm": 1.2386372089385986, + "learning_rate": 7.286739866815641e-05, + "loss": 0.0715, + "step": 618 + }, + { + "epoch": 1.5790816326530612, + "grad_norm": 1.066245436668396, + "learning_rate": 7.276437324907956e-05, + "loss": 0.0285, + "step": 619 + }, + { + "epoch": 1.5816326530612246, + "grad_norm": 1.2142025232315063, + "learning_rate": 7.2661287527556e-05, + "loss": 0.0387, + "step": 620 + }, + { + "epoch": 1.5841836734693877, + "grad_norm": 1.841284990310669, + "learning_rate": 7.255814252303807e-05, + "loss": 0.1302, + "step": 621 + }, + { + "epoch": 1.586734693877551, + "grad_norm": 1.4077726602554321, + "learning_rate": 7.245493925556435e-05, + "loss": 0.0461, + "step": 622 + }, + { + "epoch": 1.5892857142857144, + "grad_norm": 1.3630093336105347, + "learning_rate": 7.235167874574967e-05, + "loss": 0.0674, + "step": 623 + }, + { + "epoch": 1.5918367346938775, + "grad_norm": 1.0913760662078857, + "learning_rate": 7.224836201477488e-05, + "loss": 0.0401, + "step": 624 + }, + { + "epoch": 1.5943877551020407, + "grad_norm": 1.4286442995071411, + "learning_rate": 7.214499008437688e-05, + "loss": 0.0609, + "step": 625 + }, + { + "epoch": 1.5969387755102042, + "grad_norm": 1.4359664916992188, + "learning_rate": 7.204156397683843e-05, + "loss": 0.0883, + "step": 626 + }, + { + "epoch": 1.5994897959183674, + "grad_norm": 1.687910795211792, + "learning_rate": 7.193808471497804e-05, + "loss": 0.1179, + "step": 627 + }, + { + "epoch": 1.6020408163265305, + "grad_norm": 1.3069164752960205, + "learning_rate": 7.183455332213993e-05, + "loss": 0.068, + "step": 628 + }, + { + "epoch": 
1.6045918367346939, + "grad_norm": 1.3371392488479614, + "learning_rate": 7.173097082218388e-05, + "loss": 0.0375, + "step": 629 + }, + { + "epoch": 1.6071428571428572, + "grad_norm": 1.1960088014602661, + "learning_rate": 7.162733823947497e-05, + "loss": 0.0626, + "step": 630 + }, + { + "epoch": 1.6096938775510203, + "grad_norm": 1.0463345050811768, + "learning_rate": 7.152365659887373e-05, + "loss": 0.0419, + "step": 631 + }, + { + "epoch": 1.6122448979591837, + "grad_norm": 1.2048102617263794, + "learning_rate": 7.14199269257257e-05, + "loss": 0.0574, + "step": 632 + }, + { + "epoch": 1.614795918367347, + "grad_norm": 1.1888736486434937, + "learning_rate": 7.131615024585152e-05, + "loss": 0.0463, + "step": 633 + }, + { + "epoch": 1.6173469387755102, + "grad_norm": 1.4442514181137085, + "learning_rate": 7.121232758553668e-05, + "loss": 0.0538, + "step": 634 + }, + { + "epoch": 1.6198979591836735, + "grad_norm": 1.5731748342514038, + "learning_rate": 7.110845997152133e-05, + "loss": 0.1048, + "step": 635 + }, + { + "epoch": 1.6224489795918369, + "grad_norm": 1.279299020767212, + "learning_rate": 7.100454843099026e-05, + "loss": 0.0416, + "step": 636 + }, + { + "epoch": 1.625, + "grad_norm": 0.7908463478088379, + "learning_rate": 7.090059399156263e-05, + "loss": 0.0183, + "step": 637 + }, + { + "epoch": 1.6275510204081631, + "grad_norm": 0.9992243647575378, + "learning_rate": 7.079659768128184e-05, + "loss": 0.0314, + "step": 638 + }, + { + "epoch": 1.6301020408163265, + "grad_norm": 0.9718545079231262, + "learning_rate": 7.069256052860538e-05, + "loss": 0.0215, + "step": 639 + }, + { + "epoch": 1.6326530612244898, + "grad_norm": 0.5607759952545166, + "learning_rate": 7.058848356239463e-05, + "loss": 0.0097, + "step": 640 + }, + { + "epoch": 1.635204081632653, + "grad_norm": 0.7420765161514282, + "learning_rate": 7.048436781190471e-05, + "loss": 0.0294, + "step": 641 + }, + { + "epoch": 1.6377551020408163, + "grad_norm": 1.3624439239501953, + "learning_rate": 7.038021430677429e-05, + "loss": 0.0497, + "step": 642 + }, + { + "epoch": 1.6403061224489797, + "grad_norm": 1.5086288452148438, + "learning_rate": 7.02760240770154e-05, + "loss": 0.0709, + "step": 643 + }, + { + "epoch": 1.6428571428571428, + "grad_norm": 1.0940626859664917, + "learning_rate": 7.017179815300327e-05, + "loss": 0.0311, + "step": 644 + }, + { + "epoch": 1.6454081632653061, + "grad_norm": 1.6821752786636353, + "learning_rate": 7.006753756546615e-05, + "loss": 0.0536, + "step": 645 + }, + { + "epoch": 1.6479591836734695, + "grad_norm": 1.1772501468658447, + "learning_rate": 6.9963243345475e-05, + "loss": 0.0485, + "step": 646 + }, + { + "epoch": 1.6505102040816326, + "grad_norm": 1.3659448623657227, + "learning_rate": 6.985891652443348e-05, + "loss": 0.0653, + "step": 647 + }, + { + "epoch": 1.6530612244897958, + "grad_norm": 0.852611780166626, + "learning_rate": 6.975455813406759e-05, + "loss": 0.0309, + "step": 648 + }, + { + "epoch": 1.6556122448979593, + "grad_norm": 1.200761318206787, + "learning_rate": 6.965016920641556e-05, + "loss": 0.0409, + "step": 649 + }, + { + "epoch": 1.6556122448979593, + "eval_NLI_loss": 0.8265447020530701, + "eval_NLI_runtime": 7.1557, + "eval_NLI_samples_per_second": 11.879, + "eval_NLI_steps_per_second": 0.14, + "eval_Qnli-dev_cosine_accuracy": 0.734375, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.6517593860626221, + "eval_Qnli-dev_cosine_ap": 0.743077511951046, + "eval_Qnli-dev_cosine_f1": 0.7299270072992702, + "eval_Qnli-dev_cosine_f1_threshold": 0.5650049448013306, + 
"eval_Qnli-dev_cosine_mcc": 0.4512392125718268, + "eval_Qnli-dev_cosine_precision": 0.6410256410256411, + "eval_Qnli-dev_cosine_recall": 0.847457627118644, + "eval_allNLI-dev_cosine_accuracy": 0.7578125, + "eval_allNLI-dev_cosine_accuracy_threshold": 0.6773658990859985, + "eval_allNLI-dev_cosine_ap": 0.6598534883594469, + "eval_allNLI-dev_cosine_f1": 0.7047619047619047, + "eval_allNLI-dev_cosine_f1_threshold": 0.6353235244750977, + "eval_allNLI-dev_cosine_mcc": 0.5352526699107679, + "eval_allNLI-dev_cosine_precision": 0.5967741935483871, + "eval_allNLI-dev_cosine_recall": 0.8604651162790697, + "eval_sequential_score": 0.743077511951046, + "eval_sts-test_pearson_cosine": 0.9066359145491176, + "eval_sts-test_spearman_cosine": 0.9193507604850361, + "step": 649 + }, + { + "epoch": 1.6556122448979593, + "eval_natural-questions_loss": 0.1231827586889267, + "eval_natural-questions_runtime": 42.1875, + "eval_natural-questions_samples_per_second": 2.679, + "eval_natural-questions_steps_per_second": 0.024, + "step": 649 + }, + { + "epoch": 1.6556122448979593, + "eval_vitaminc_loss": 2.4417307376861572, + "eval_vitaminc_runtime": 1.5023, + "eval_vitaminc_samples_per_second": 75.218, + "eval_vitaminc_steps_per_second": 0.666, + "step": 649 + }, + { + "epoch": 1.6556122448979593, + "eval_xsum_loss": 0.08192133158445358, + "eval_xsum_runtime": 7.5857, + "eval_xsum_samples_per_second": 14.896, + "eval_xsum_steps_per_second": 0.132, + "step": 649 + }, + { + "epoch": 1.6556122448979593, + "eval_paws_loss": 0.02064836397767067, + "eval_paws_runtime": 1.3691, + "eval_paws_samples_per_second": 82.537, + "eval_paws_steps_per_second": 0.73, + "step": 649 + }, + { + "epoch": 1.6556122448979593, + "eval_global_dataset_loss": 0.3361644446849823, + "eval_global_dataset_runtime": 15.8388, + "eval_global_dataset_samples_per_second": 16.163, + "eval_global_dataset_steps_per_second": 0.063, + "step": 649 + }, + { + "epoch": 1.6581632653061225, + "grad_norm": 1.91592538356781, + "learning_rate": 6.954575077381763e-05, + "loss": 0.0788, + "step": 650 + }, + { + "epoch": 1.6607142857142856, + "grad_norm": 1.683900237083435, + "learning_rate": 6.944130386890577e-05, + "loss": 0.0917, + "step": 651 + }, + { + "epoch": 1.663265306122449, + "grad_norm": 1.102791428565979, + "learning_rate": 6.933682952459358e-05, + "loss": 0.035, + "step": 652 + }, + { + "epoch": 1.6658163265306123, + "grad_norm": 1.2626458406448364, + "learning_rate": 6.923232877406599e-05, + "loss": 0.0411, + "step": 653 + }, + { + "epoch": 1.6683673469387754, + "grad_norm": 1.6429839134216309, + "learning_rate": 6.912780265076908e-05, + "loss": 0.059, + "step": 654 + }, + { + "epoch": 1.6709183673469388, + "grad_norm": 0.8500655293464661, + "learning_rate": 6.902325218839982e-05, + "loss": 0.035, + "step": 655 + }, + { + "epoch": 1.6734693877551021, + "grad_norm": 1.4818044900894165, + "learning_rate": 6.891867842089595e-05, + "loss": 0.0791, + "step": 656 + }, + { + "epoch": 1.6760204081632653, + "grad_norm": 1.1503280401229858, + "learning_rate": 6.881408238242561e-05, + "loss": 0.0235, + "step": 657 + }, + { + "epoch": 1.6785714285714286, + "grad_norm": 1.7550290822982788, + "learning_rate": 6.870946510737721e-05, + "loss": 0.0869, + "step": 658 + }, + { + "epoch": 1.681122448979592, + "grad_norm": 1.5277817249298096, + "learning_rate": 6.86048276303492e-05, + "loss": 0.0886, + "step": 659 + }, + { + "epoch": 1.683673469387755, + "grad_norm": 1.4250398874282837, + "learning_rate": 6.850017098613978e-05, + "loss": 0.0461, + "step": 660 + }, + { + 
"epoch": 1.6862244897959182, + "grad_norm": 1.5854361057281494, + "learning_rate": 6.839549620973671e-05, + "loss": 0.0761, + "step": 661 + }, + { + "epoch": 1.6887755102040818, + "grad_norm": 1.3406635522842407, + "learning_rate": 6.829080433630708e-05, + "loss": 0.0488, + "step": 662 + }, + { + "epoch": 1.691326530612245, + "grad_norm": 1.5655086040496826, + "learning_rate": 6.818609640118706e-05, + "loss": 0.0531, + "step": 663 + }, + { + "epoch": 1.693877551020408, + "grad_norm": 1.8102643489837646, + "learning_rate": 6.808137343987162e-05, + "loss": 0.0801, + "step": 664 + }, + { + "epoch": 1.6964285714285714, + "grad_norm": 0.8688121438026428, + "learning_rate": 6.797663648800439e-05, + "loss": 0.0326, + "step": 665 + }, + { + "epoch": 1.6989795918367347, + "grad_norm": 0.719764769077301, + "learning_rate": 6.787188658136732e-05, + "loss": 0.0147, + "step": 666 + }, + { + "epoch": 1.7015306122448979, + "grad_norm": 1.119425654411316, + "learning_rate": 6.776712475587046e-05, + "loss": 0.0252, + "step": 667 + }, + { + "epoch": 1.7040816326530612, + "grad_norm": 1.630712866783142, + "learning_rate": 6.766235204754177e-05, + "loss": 0.0776, + "step": 668 + }, + { + "epoch": 1.7066326530612246, + "grad_norm": 0.7583345174789429, + "learning_rate": 6.755756949251684e-05, + "loss": 0.0213, + "step": 669 + }, + { + "epoch": 1.7091836734693877, + "grad_norm": 0.6478736400604248, + "learning_rate": 6.745277812702857e-05, + "loss": 0.0213, + "step": 670 + }, + { + "epoch": 1.711734693877551, + "grad_norm": 1.6099013090133667, + "learning_rate": 6.734797898739707e-05, + "loss": 0.093, + "step": 671 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 1.715428113937378, + "learning_rate": 6.724317311001926e-05, + "loss": 0.0989, + "step": 672 + }, + { + "epoch": 1.7168367346938775, + "grad_norm": 0.88747638463974, + "learning_rate": 6.713836153135876e-05, + "loss": 0.0264, + "step": 673 + }, + { + "epoch": 1.7193877551020407, + "grad_norm": 1.2536309957504272, + "learning_rate": 6.70335452879355e-05, + "loss": 0.059, + "step": 674 + }, + { + "epoch": 1.7219387755102042, + "grad_norm": 1.5352704524993896, + "learning_rate": 6.692872541631562e-05, + "loss": 0.1042, + "step": 675 + }, + { + "epoch": 1.7244897959183674, + "grad_norm": 0.957023561000824, + "learning_rate": 6.682390295310108e-05, + "loss": 0.0509, + "step": 676 + }, + { + "epoch": 1.7270408163265305, + "grad_norm": 0.9214121103286743, + "learning_rate": 6.671907893491948e-05, + "loss": 0.0224, + "step": 677 + }, + { + "epoch": 1.7295918367346939, + "grad_norm": 1.7822436094284058, + "learning_rate": 6.661425439841384e-05, + "loss": 0.067, + "step": 678 + }, + { + "epoch": 1.7321428571428572, + "grad_norm": 2.8248071670532227, + "learning_rate": 6.650943038023228e-05, + "loss": 0.3453, + "step": 679 + }, + { + "epoch": 1.7346938775510203, + "grad_norm": 2.031522035598755, + "learning_rate": 6.640460791701772e-05, + "loss": 0.1518, + "step": 680 + }, + { + "epoch": 1.7372448979591837, + "grad_norm": 1.5122132301330566, + "learning_rate": 6.629978804539785e-05, + "loss": 0.0635, + "step": 681 + }, + { + "epoch": 1.739795918367347, + "grad_norm": 1.4238784313201904, + "learning_rate": 6.619497180197461e-05, + "loss": 0.0669, + "step": 682 + }, + { + "epoch": 1.7423469387755102, + "grad_norm": 1.4297114610671997, + "learning_rate": 6.60901602233141e-05, + "loss": 0.0635, + "step": 683 + }, + { + "epoch": 1.7448979591836735, + "grad_norm": 1.754320502281189, + "learning_rate": 6.598535434593629e-05, + "loss": 0.1072, + "step": 684 + }, + 
{ + "epoch": 1.7474489795918369, + "grad_norm": 0.35999032855033875, + "learning_rate": 6.588055520630479e-05, + "loss": 0.008, + "step": 685 + }, + { + "epoch": 1.75, + "grad_norm": 1.4126272201538086, + "learning_rate": 6.577576384081651e-05, + "loss": 0.0338, + "step": 686 + }, + { + "epoch": 1.7525510204081631, + "grad_norm": 1.7516684532165527, + "learning_rate": 6.567098128579157e-05, + "loss": 0.0986, + "step": 687 + }, + { + "epoch": 1.7551020408163265, + "grad_norm": 1.5485588312149048, + "learning_rate": 6.556620857746288e-05, + "loss": 0.0534, + "step": 688 + }, + { + "epoch": 1.7576530612244898, + "grad_norm": 1.1567020416259766, + "learning_rate": 6.546144675196604e-05, + "loss": 0.0464, + "step": 689 + }, + { + "epoch": 1.760204081632653, + "grad_norm": 1.2045881748199463, + "learning_rate": 6.535669684532897e-05, + "loss": 0.0364, + "step": 690 + }, + { + "epoch": 1.7627551020408163, + "grad_norm": 0.9928134679794312, + "learning_rate": 6.525195989346173e-05, + "loss": 0.0276, + "step": 691 + }, + { + "epoch": 1.7653061224489797, + "grad_norm": 1.3721133470535278, + "learning_rate": 6.514723693214631e-05, + "loss": 0.0598, + "step": 692 + }, + { + "epoch": 1.7678571428571428, + "grad_norm": 1.2204275131225586, + "learning_rate": 6.504252899702627e-05, + "loss": 0.0372, + "step": 693 + }, + { + "epoch": 1.7704081632653061, + "grad_norm": 1.1454434394836426, + "learning_rate": 6.493783712359663e-05, + "loss": 0.0759, + "step": 694 + }, + { + "epoch": 1.7729591836734695, + "grad_norm": 1.0549447536468506, + "learning_rate": 6.483316234719357e-05, + "loss": 0.0294, + "step": 695 + }, + { + "epoch": 1.7755102040816326, + "grad_norm": 1.4000238180160522, + "learning_rate": 6.472850570298415e-05, + "loss": 0.0537, + "step": 696 + }, + { + "epoch": 1.7780612244897958, + "grad_norm": 1.1177244186401367, + "learning_rate": 6.462386822595614e-05, + "loss": 0.0608, + "step": 697 + }, + { + "epoch": 1.7806122448979593, + "grad_norm": 2.3013904094696045, + "learning_rate": 6.451925095090773e-05, + "loss": 0.1348, + "step": 698 + }, + { + "epoch": 1.7831632653061225, + "grad_norm": 1.2447232007980347, + "learning_rate": 6.441465491243739e-05, + "loss": 0.0588, + "step": 699 + }, + { + "epoch": 1.7857142857142856, + "grad_norm": 1.355269193649292, + "learning_rate": 6.431008114493352e-05, + "loss": 0.0524, + "step": 700 + }, + { + "epoch": 1.788265306122449, + "grad_norm": 1.0311199426651, + "learning_rate": 6.420553068256427e-05, + "loss": 0.0247, + "step": 701 + }, + { + "epoch": 1.7908163265306123, + "grad_norm": 1.6527416706085205, + "learning_rate": 6.410100455926737e-05, + "loss": 0.0673, + "step": 702 + }, + { + "epoch": 1.7933673469387754, + "grad_norm": 0.961963415145874, + "learning_rate": 6.399650380873976e-05, + "loss": 0.0206, + "step": 703 + }, + { + "epoch": 1.7959183673469388, + "grad_norm": 1.7512751817703247, + "learning_rate": 6.389202946442758e-05, + "loss": 0.109, + "step": 704 + }, + { + "epoch": 1.7984693877551021, + "grad_norm": 1.3092968463897705, + "learning_rate": 6.378758255951573e-05, + "loss": 0.0592, + "step": 705 + }, + { + "epoch": 1.8010204081632653, + "grad_norm": 1.4627022743225098, + "learning_rate": 6.368316412691779e-05, + "loss": 0.071, + "step": 706 + }, + { + "epoch": 1.8035714285714286, + "grad_norm": 1.0733097791671753, + "learning_rate": 6.357877519926577e-05, + "loss": 0.0589, + "step": 707 + }, + { + "epoch": 1.806122448979592, + "grad_norm": 1.573248267173767, + "learning_rate": 6.347441680889988e-05, + "loss": 0.0772, + "step": 708 + }, + { + 
"epoch": 1.806122448979592, + "eval_NLI_loss": 0.8252545595169067, + "eval_NLI_runtime": 7.1379, + "eval_NLI_samples_per_second": 11.908, + "eval_NLI_steps_per_second": 0.14, + "eval_Qnli-dev_cosine_accuracy": 0.7109375, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.7013107538223267, + "eval_Qnli-dev_cosine_ap": 0.7275670751574244, + "eval_Qnli-dev_cosine_f1": 0.7101449275362318, + "eval_Qnli-dev_cosine_f1_threshold": 0.5595828294754028, + "eval_Qnli-dev_cosine_mcc": 0.4058199598560115, + "eval_Qnli-dev_cosine_precision": 0.620253164556962, + "eval_Qnli-dev_cosine_recall": 0.8305084745762712, + "eval_allNLI-dev_cosine_accuracy": 0.78125, + "eval_allNLI-dev_cosine_accuracy_threshold": 0.6589555144309998, + "eval_allNLI-dev_cosine_ap": 0.6682119018867241, + "eval_allNLI-dev_cosine_f1": 0.72, + "eval_allNLI-dev_cosine_f1_threshold": 0.6589555144309998, + "eval_allNLI-dev_cosine_mcc": 0.5608411256005545, + "eval_allNLI-dev_cosine_precision": 0.631578947368421, + "eval_allNLI-dev_cosine_recall": 0.8372093023255814, + "eval_sequential_score": 0.7275670751574244, + "eval_sts-test_pearson_cosine": 0.9045591699302508, + "eval_sts-test_spearman_cosine": 0.9176380849603257, + "step": 708 + }, + { + "epoch": 1.806122448979592, + "eval_natural-questions_loss": 0.13228358328342438, + "eval_natural-questions_runtime": 42.1549, + "eval_natural-questions_samples_per_second": 2.681, + "eval_natural-questions_steps_per_second": 0.024, + "step": 708 + }, + { + "epoch": 1.806122448979592, + "eval_vitaminc_loss": 2.501589298248291, + "eval_vitaminc_runtime": 1.501, + "eval_vitaminc_samples_per_second": 75.282, + "eval_vitaminc_steps_per_second": 0.666, + "step": 708 + }, + { + "epoch": 1.806122448979592, + "eval_xsum_loss": 0.10769912600517273, + "eval_xsum_runtime": 7.592, + "eval_xsum_samples_per_second": 14.884, + "eval_xsum_steps_per_second": 0.132, + "step": 708 + }, + { + "epoch": 1.806122448979592, + "eval_paws_loss": 0.020791849121451378, + "eval_paws_runtime": 1.3627, + "eval_paws_samples_per_second": 82.922, + "eval_paws_steps_per_second": 0.734, + "step": 708 + }, + { + "epoch": 1.806122448979592, + "eval_global_dataset_loss": 0.30187565088272095, + "eval_global_dataset_runtime": 15.8267, + "eval_global_dataset_samples_per_second": 16.175, + "eval_global_dataset_steps_per_second": 0.063, + "step": 708 + }, + { + "epoch": 1.808673469387755, + "grad_norm": 0.7930084466934204, + "learning_rate": 6.337008998785835e-05, + "loss": 0.0416, + "step": 709 + }, + { + "epoch": 1.8112244897959182, + "grad_norm": 0.8516618013381958, + "learning_rate": 6.32657957678672e-05, + "loss": 0.0622, + "step": 710 + }, + { + "epoch": 1.8137755102040818, + "grad_norm": 0.7785334587097168, + "learning_rate": 6.316153518033007e-05, + "loss": 0.0237, + "step": 711 + }, + { + "epoch": 1.816326530612245, + "grad_norm": 1.2170414924621582, + "learning_rate": 6.305730925631794e-05, + "loss": 0.0598, + "step": 712 + }, + { + "epoch": 1.818877551020408, + "grad_norm": 0.9199419021606445, + "learning_rate": 6.295311902655905e-05, + "loss": 0.0259, + "step": 713 + }, + { + "epoch": 1.8214285714285714, + "grad_norm": 1.5218782424926758, + "learning_rate": 6.284896552142864e-05, + "loss": 0.075, + "step": 714 + }, + { + "epoch": 1.8239795918367347, + "grad_norm": 1.5817890167236328, + "learning_rate": 6.274484977093874e-05, + "loss": 0.0778, + "step": 715 + }, + { + "epoch": 1.8265306122448979, + "grad_norm": 1.4637110233306885, + "learning_rate": 6.264077280472798e-05, + "loss": 0.0424, + "step": 716 + }, + { + "epoch": 
1.8290816326530612, + "grad_norm": 1.3012669086456299, + "learning_rate": 6.253673565205151e-05, + "loss": 0.0711, + "step": 717 + }, + { + "epoch": 1.8316326530612246, + "grad_norm": 1.0491163730621338, + "learning_rate": 6.243273934177072e-05, + "loss": 0.0508, + "step": 718 + }, + { + "epoch": 1.8341836734693877, + "grad_norm": 1.3723552227020264, + "learning_rate": 6.232878490234309e-05, + "loss": 0.0765, + "step": 719 + }, + { + "epoch": 1.836734693877551, + "grad_norm": 1.2096433639526367, + "learning_rate": 6.222487336181202e-05, + "loss": 0.0457, + "step": 720 + }, + { + "epoch": 1.8392857142857144, + "grad_norm": 2.9045143127441406, + "learning_rate": 6.212100574779669e-05, + "loss": 0.1753, + "step": 721 + }, + { + "epoch": 1.8418367346938775, + "grad_norm": 0.9910968542098999, + "learning_rate": 6.20171830874818e-05, + "loss": 0.0397, + "step": 722 + }, + { + "epoch": 1.8443877551020407, + "grad_norm": 1.330057978630066, + "learning_rate": 6.191340640760764e-05, + "loss": 0.0708, + "step": 723 + }, + { + "epoch": 1.8469387755102042, + "grad_norm": 0.2348063588142395, + "learning_rate": 6.180967673445963e-05, + "loss": 0.035, + "step": 724 + }, + { + "epoch": 1.8494897959183674, + "grad_norm": 2.617224931716919, + "learning_rate": 6.170599509385837e-05, + "loss": 0.5028, + "step": 725 + }, + { + "epoch": 1.8520408163265305, + "grad_norm": 1.698957920074463, + "learning_rate": 6.160236251114949e-05, + "loss": 0.0604, + "step": 726 + }, + { + "epoch": 1.8545918367346939, + "grad_norm": 1.582006812095642, + "learning_rate": 6.14987800111934e-05, + "loss": 0.0838, + "step": 727 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 1.8623155355453491, + "learning_rate": 6.13952486183553e-05, + "loss": 0.1376, + "step": 728 + }, + { + "epoch": 1.8596938775510203, + "grad_norm": 1.8422627449035645, + "learning_rate": 6.129176935649493e-05, + "loss": 0.0947, + "step": 729 + }, + { + "epoch": 1.8622448979591837, + "grad_norm": 2.4534711837768555, + "learning_rate": 6.118834324895648e-05, + "loss": 0.3682, + "step": 730 + }, + { + "epoch": 1.864795918367347, + "grad_norm": 1.6681021451950073, + "learning_rate": 6.108497131855848e-05, + "loss": 0.0916, + "step": 731 + }, + { + "epoch": 1.8673469387755102, + "grad_norm": 1.103634238243103, + "learning_rate": 6.098165458758368e-05, + "loss": 0.0413, + "step": 732 + }, + { + "epoch": 1.8698979591836735, + "grad_norm": 1.2579349279403687, + "learning_rate": 6.0878394077768995e-05, + "loss": 0.037, + "step": 733 + }, + { + "epoch": 1.8724489795918369, + "grad_norm": 1.2993561029434204, + "learning_rate": 6.077519081029528e-05, + "loss": 0.0439, + "step": 734 + }, + { + "epoch": 1.875, + "grad_norm": 1.3299835920333862, + "learning_rate": 6.067204580577734e-05, + "loss": 0.0511, + "step": 735 + }, + { + "epoch": 1.8775510204081631, + "grad_norm": 1.7977983951568604, + "learning_rate": 6.056896008425379e-05, + "loss": 0.0639, + "step": 736 + }, + { + "epoch": 1.8801020408163265, + "grad_norm": 0.7083833813667297, + "learning_rate": 6.046593466517694e-05, + "loss": 0.0189, + "step": 737 + }, + { + "epoch": 1.8826530612244898, + "grad_norm": 1.153009295463562, + "learning_rate": 6.036297056740279e-05, + "loss": 0.0726, + "step": 738 + }, + { + "epoch": 1.885204081632653, + "grad_norm": 1.7064392566680908, + "learning_rate": 6.026006880918092e-05, + "loss": 0.0966, + "step": 739 + }, + { + "epoch": 1.8877551020408163, + "grad_norm": 0.32830068469047546, + "learning_rate": 6.015723040814434e-05, + "loss": 0.0848, + "step": 740 + }, + { + "epoch": 
1.8903061224489797, + "grad_norm": 1.043041467666626, + "learning_rate": 6.0054456381299585e-05, + "loss": 0.038, + "step": 741 + }, + { + "epoch": 1.8928571428571428, + "grad_norm": 1.5883595943450928, + "learning_rate": 5.9951747745016495e-05, + "loss": 0.0846, + "step": 742 + }, + { + "epoch": 1.8954081632653061, + "grad_norm": 1.3991661071777344, + "learning_rate": 5.984910551501831e-05, + "loss": 0.0832, + "step": 743 + }, + { + "epoch": 1.8979591836734695, + "grad_norm": 0.9537280201911926, + "learning_rate": 5.9746530706371495e-05, + "loss": 0.033, + "step": 744 + }, + { + "epoch": 1.9005102040816326, + "grad_norm": 1.6849956512451172, + "learning_rate": 5.96440243334758e-05, + "loss": 0.0761, + "step": 745 + }, + { + "epoch": 1.9030612244897958, + "grad_norm": 0.3973057270050049, + "learning_rate": 5.9541587410054156e-05, + "loss": 0.012, + "step": 746 + }, + { + "epoch": 1.9056122448979593, + "grad_norm": 0.7332958579063416, + "learning_rate": 5.9439220949142694e-05, + "loss": 0.0153, + "step": 747 + }, + { + "epoch": 1.9081632653061225, + "grad_norm": 1.4115517139434814, + "learning_rate": 5.9336925963080757e-05, + "loss": 0.0663, + "step": 748 + }, + { + "epoch": 1.9107142857142856, + "grad_norm": 1.229833722114563, + "learning_rate": 5.923470346350077e-05, + "loss": 0.0436, + "step": 749 + }, + { + "epoch": 1.913265306122449, + "grad_norm": 1.4304879903793335, + "learning_rate": 5.913255446131838e-05, + "loss": 0.0511, + "step": 750 + }, + { + "epoch": 1.9158163265306123, + "grad_norm": 1.460972785949707, + "learning_rate": 5.903047996672234e-05, + "loss": 0.0698, + "step": 751 + }, + { + "epoch": 1.9183673469387754, + "grad_norm": 1.250166416168213, + "learning_rate": 5.892848098916462e-05, + "loss": 0.0322, + "step": 752 + }, + { + "epoch": 1.9209183673469388, + "grad_norm": 0.8898938298225403, + "learning_rate": 5.882655853735034e-05, + "loss": 0.0245, + "step": 753 + }, + { + "epoch": 1.9234693877551021, + "grad_norm": 0.6683658361434937, + "learning_rate": 5.872471361922785e-05, + "loss": 0.0245, + "step": 754 + }, + { + "epoch": 1.9260204081632653, + "grad_norm": 0.9458708763122559, + "learning_rate": 5.86229472419787e-05, + "loss": 0.027, + "step": 755 + }, + { + "epoch": 1.9285714285714286, + "grad_norm": 1.328016757965088, + "learning_rate": 5.8521260412007785e-05, + "loss": 0.0507, + "step": 756 + }, + { + "epoch": 1.931122448979592, + "grad_norm": 1.1458810567855835, + "learning_rate": 5.841965413493326e-05, + "loss": 0.0587, + "step": 757 + }, + { + "epoch": 1.933673469387755, + "grad_norm": 1.8090777397155762, + "learning_rate": 5.8318129415576725e-05, + "loss": 0.0697, + "step": 758 + }, + { + "epoch": 1.9362244897959182, + "grad_norm": 1.4781396389007568, + "learning_rate": 5.821668725795318e-05, + "loss": 0.0789, + "step": 759 + }, + { + "epoch": 1.9387755102040818, + "grad_norm": 1.463580846786499, + "learning_rate": 5.811532866526117e-05, + "loss": 0.0577, + "step": 760 + }, + { + "epoch": 1.941326530612245, + "grad_norm": 1.7788509130477905, + "learning_rate": 5.801405463987283e-05, + "loss": 0.0919, + "step": 761 + }, + { + "epoch": 1.943877551020408, + "grad_norm": 1.735632061958313, + "learning_rate": 5.791286618332394e-05, + "loss": 0.0737, + "step": 762 + }, + { + "epoch": 1.9464285714285714, + "grad_norm": 1.3253400325775146, + "learning_rate": 5.781176429630413e-05, + "loss": 0.069, + "step": 763 + }, + { + "epoch": 1.9489795918367347, + "grad_norm": 1.9220123291015625, + "learning_rate": 5.771074997864686e-05, + "loss": 0.1402, + "step": 764 + }, + { 
+ "epoch": 1.9515306122448979, + "grad_norm": 1.455010175704956, + "learning_rate": 5.76098242293196e-05, + "loss": 0.0599, + "step": 765 + }, + { + "epoch": 1.9540816326530612, + "grad_norm": 0.6742501258850098, + "learning_rate": 5.750898804641392e-05, + "loss": 0.042, + "step": 766 + }, + { + "epoch": 1.9566326530612246, + "grad_norm": 2.1479434967041016, + "learning_rate": 5.740824242713565e-05, + "loss": 0.1276, + "step": 767 + }, + { + "epoch": 1.9566326530612246, + "eval_NLI_loss": 0.8538345694541931, + "eval_NLI_runtime": 7.14, + "eval_NLI_samples_per_second": 11.905, + "eval_NLI_steps_per_second": 0.14, + "eval_Qnli-dev_cosine_accuracy": 0.703125, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.7312531471252441, + "eval_Qnli-dev_cosine_ap": 0.7243762296174704, + "eval_Qnli-dev_cosine_f1": 0.7183098591549295, + "eval_Qnli-dev_cosine_f1_threshold": 0.548336386680603, + "eval_Qnli-dev_cosine_mcc": 0.4182713391161482, + "eval_Qnli-dev_cosine_precision": 0.6144578313253012, + "eval_Qnli-dev_cosine_recall": 0.864406779661017, + "eval_allNLI-dev_cosine_accuracy": 0.78125, + "eval_allNLI-dev_cosine_accuracy_threshold": 0.6732373237609863, + "eval_allNLI-dev_cosine_ap": 0.66692875458475, + "eval_allNLI-dev_cosine_f1": 0.72, + "eval_allNLI-dev_cosine_f1_threshold": 0.6590439081192017, + "eval_allNLI-dev_cosine_mcc": 0.5608411256005545, + "eval_allNLI-dev_cosine_precision": 0.631578947368421, + "eval_allNLI-dev_cosine_recall": 0.8372093023255814, + "eval_sequential_score": 0.7243762296174704, + "eval_sts-test_pearson_cosine": 0.9043945442719601, + "eval_sts-test_spearman_cosine": 0.9181132986050412, + "step": 767 + }, + { + "epoch": 1.9566326530612246, + "eval_natural-questions_loss": 0.11677658557891846, + "eval_natural-questions_runtime": 42.1731, + "eval_natural-questions_samples_per_second": 2.679, + "eval_natural-questions_steps_per_second": 0.024, + "step": 767 + }, + { + "epoch": 1.9566326530612246, + "eval_vitaminc_loss": 2.523444890975952, + "eval_vitaminc_runtime": 1.5087, + "eval_vitaminc_samples_per_second": 74.901, + "eval_vitaminc_steps_per_second": 0.663, + "step": 767 + }, + { + "epoch": 1.9566326530612246, + "eval_xsum_loss": 0.09602713584899902, + "eval_xsum_runtime": 7.5886, + "eval_xsum_samples_per_second": 14.891, + "eval_xsum_steps_per_second": 0.132, + "step": 767 + }, + { + "epoch": 1.9566326530612246, + "eval_paws_loss": 0.020384209230542183, + "eval_paws_runtime": 1.3714, + "eval_paws_samples_per_second": 82.398, + "eval_paws_steps_per_second": 0.729, + "step": 767 + }, + { + "epoch": 1.9566326530612246, + "eval_global_dataset_loss": 0.3154963552951813, + "eval_global_dataset_runtime": 15.8408, + "eval_global_dataset_samples_per_second": 16.161, + "eval_global_dataset_steps_per_second": 0.063, + "step": 767 + }, + { + "epoch": 1.9591836734693877, + "grad_norm": 1.4790257215499878, + "learning_rate": 5.730758836779503e-05, + "loss": 0.0995, + "step": 768 + }, + { + "epoch": 1.961734693877551, + "grad_norm": 1.158976674079895, + "learning_rate": 5.7207026863796774e-05, + "loss": 0.0381, + "step": 769 + }, + { + "epoch": 1.9642857142857144, + "grad_norm": 2.385294198989868, + "learning_rate": 5.7106558909630334e-05, + "loss": 0.1053, + "step": 770 + }, + { + "epoch": 1.9668367346938775, + "grad_norm": 1.3777903318405151, + "learning_rate": 5.7006185498859975e-05, + "loss": 0.0385, + "step": 771 + }, + { + "epoch": 1.9693877551020407, + "grad_norm": 1.1980561017990112, + "learning_rate": 5.6905907624114996e-05, + "loss": 0.0452, + "step": 772 + }, + { + "epoch": 
1.9719387755102042, + "grad_norm": 1.155332088470459, + "learning_rate": 5.680572627707993e-05, + "loss": 0.0702, + "step": 773 + }, + { + "epoch": 1.9744897959183674, + "grad_norm": 1.0926625728607178, + "learning_rate": 5.6705642448484684e-05, + "loss": 0.0506, + "step": 774 + }, + { + "epoch": 1.9770408163265305, + "grad_norm": 0.9221295714378357, + "learning_rate": 5.660565712809478e-05, + "loss": 0.0351, + "step": 775 + }, + { + "epoch": 1.9795918367346939, + "grad_norm": 2.4870173931121826, + "learning_rate": 5.650577130470156e-05, + "loss": 0.2187, + "step": 776 + }, + { + "epoch": 1.9821428571428572, + "grad_norm": 0.862179696559906, + "learning_rate": 5.640598596611242e-05, + "loss": 0.0222, + "step": 777 + }, + { + "epoch": 1.9846938775510203, + "grad_norm": 1.220673680305481, + "learning_rate": 5.630630209914096e-05, + "loss": 0.0532, + "step": 778 + }, + { + "epoch": 1.9872448979591837, + "grad_norm": 1.639878273010254, + "learning_rate": 5.620672068959739e-05, + "loss": 0.067, + "step": 779 + }, + { + "epoch": 1.989795918367347, + "grad_norm": 1.053244948387146, + "learning_rate": 5.6107242722278586e-05, + "loss": 0.0833, + "step": 780 + }, + { + "epoch": 1.9923469387755102, + "grad_norm": 1.3326953649520874, + "learning_rate": 5.60078691809585e-05, + "loss": 0.0442, + "step": 781 + }, + { + "epoch": 1.9948979591836735, + "grad_norm": 1.4838842153549194, + "learning_rate": 5.590860104837836e-05, + "loss": 0.0744, + "step": 782 + }, + { + "epoch": 1.9974489795918369, + "grad_norm": 1.2816078662872314, + "learning_rate": 5.580943930623699e-05, + "loss": 0.047, + "step": 783 + }, + { + "epoch": 2.0, + "grad_norm": 0.010865384712815285, + "learning_rate": 5.571038493518105e-05, + "loss": 0.0001, + "step": 784 + }, + { + "epoch": 2.002551020408163, + "grad_norm": 0.37452027201652527, + "learning_rate": 5.561143891479538e-05, + "loss": 0.0084, + "step": 785 + }, + { + "epoch": 2.0051020408163267, + "grad_norm": 0.24096043407917023, + "learning_rate": 5.551260222359329e-05, + "loss": 0.0048, + "step": 786 + }, + { + "epoch": 2.00765306122449, + "grad_norm": 0.7445675730705261, + "learning_rate": 5.5413875839006924e-05, + "loss": 0.0306, + "step": 787 + }, + { + "epoch": 2.010204081632653, + "grad_norm": 0.8497123122215271, + "learning_rate": 5.531526073737756e-05, + "loss": 0.0197, + "step": 788 + }, + { + "epoch": 2.0127551020408165, + "grad_norm": 0.8968077301979065, + "learning_rate": 5.52167578939459e-05, + "loss": 0.0228, + "step": 789 + }, + { + "epoch": 2.0153061224489797, + "grad_norm": 0.9922608733177185, + "learning_rate": 5.511836828284256e-05, + "loss": 0.0464, + "step": 790 + }, + { + "epoch": 2.017857142857143, + "grad_norm": 1.3334237337112427, + "learning_rate": 5.5020092877078314e-05, + "loss": 0.0705, + "step": 791 + }, + { + "epoch": 2.020408163265306, + "grad_norm": 0.8067697286605835, + "learning_rate": 5.4921932648534524e-05, + "loss": 0.031, + "step": 792 + }, + { + "epoch": 2.0229591836734695, + "grad_norm": 0.6346778869628906, + "learning_rate": 5.482388856795355e-05, + "loss": 0.0212, + "step": 793 + }, + { + "epoch": 2.0255102040816326, + "grad_norm": 1.3385857343673706, + "learning_rate": 5.47259616049291e-05, + "loss": 0.0541, + "step": 794 + }, + { + "epoch": 2.0280612244897958, + "grad_norm": 0.418996661901474, + "learning_rate": 5.462815272789664e-05, + "loss": 0.0144, + "step": 795 + }, + { + "epoch": 2.0306122448979593, + "grad_norm": 0.5900735855102539, + "learning_rate": 5.453046290412387e-05, + "loss": 0.0101, + "step": 796 + }, + { + "epoch": 
2.0331632653061225, + "grad_norm": 1.5913845300674438, + "learning_rate": 5.4432893099701164e-05, + "loss": 0.0737, + "step": 797 + }, + { + "epoch": 2.0357142857142856, + "grad_norm": 0.41410014033317566, + "learning_rate": 5.43354442795319e-05, + "loss": 0.0116, + "step": 798 + }, + { + "epoch": 2.038265306122449, + "grad_norm": 1.8331905603408813, + "learning_rate": 5.423811740732305e-05, + "loss": 0.1078, + "step": 799 + }, + { + "epoch": 2.0408163265306123, + "grad_norm": 1.3418947458267212, + "learning_rate": 5.41409134455756e-05, + "loss": 0.0574, + "step": 800 + }, + { + "epoch": 2.0433673469387754, + "grad_norm": 0.5035209655761719, + "learning_rate": 5.4043833355575e-05, + "loss": 0.0106, + "step": 801 + }, + { + "epoch": 2.045918367346939, + "grad_norm": 1.231858253479004, + "learning_rate": 5.3946878097381714e-05, + "loss": 0.0404, + "step": 802 + }, + { + "epoch": 2.048469387755102, + "grad_norm": 1.5155589580535889, + "learning_rate": 5.38500486298217e-05, + "loss": 0.065, + "step": 803 + }, + { + "epoch": 2.0510204081632653, + "grad_norm": 1.0770633220672607, + "learning_rate": 5.37533459104769e-05, + "loss": 0.0386, + "step": 804 + }, + { + "epoch": 2.0535714285714284, + "grad_norm": 1.2623214721679688, + "learning_rate": 5.365677089567582e-05, + "loss": 0.037, + "step": 805 + }, + { + "epoch": 2.056122448979592, + "grad_norm": 0.914726197719574, + "learning_rate": 5.356032454048404e-05, + "loss": 0.0232, + "step": 806 + }, + { + "epoch": 2.058673469387755, + "grad_norm": 1.2015514373779297, + "learning_rate": 5.346400779869477e-05, + "loss": 0.0672, + "step": 807 + }, + { + "epoch": 2.061224489795918, + "grad_norm": 0.6268401741981506, + "learning_rate": 5.3367821622819455e-05, + "loss": 0.0354, + "step": 808 + }, + { + "epoch": 2.063775510204082, + "grad_norm": 0.8188040256500244, + "learning_rate": 5.32717669640783e-05, + "loss": 0.0287, + "step": 809 + }, + { + "epoch": 2.066326530612245, + "grad_norm": 1.0910505056381226, + "learning_rate": 5.317584477239089e-05, + "loss": 0.0466, + "step": 810 + }, + { + "epoch": 2.068877551020408, + "grad_norm": 1.2691212892532349, + "learning_rate": 5.30800559963668e-05, + "loss": 0.0339, + "step": 811 + }, + { + "epoch": 2.0714285714285716, + "grad_norm": 1.0486149787902832, + "learning_rate": 5.298440158329622e-05, + "loss": 0.0712, + "step": 812 + }, + { + "epoch": 2.0739795918367347, + "grad_norm": 1.570844292640686, + "learning_rate": 5.288888247914054e-05, + "loss": 0.0843, + "step": 813 + }, + { + "epoch": 2.076530612244898, + "grad_norm": 1.2492882013320923, + "learning_rate": 5.279349962852308e-05, + "loss": 0.0485, + "step": 814 + }, + { + "epoch": 2.079081632653061, + "grad_norm": 0.9154626131057739, + "learning_rate": 5.269825397471968e-05, + "loss": 0.0363, + "step": 815 + }, + { + "epoch": 2.0816326530612246, + "grad_norm": 2.1015210151672363, + "learning_rate": 5.260314645964936e-05, + "loss": 0.1128, + "step": 816 + }, + { + "epoch": 2.0841836734693877, + "grad_norm": 0.22980009019374847, + "learning_rate": 5.250817802386507e-05, + "loss": 0.0395, + "step": 817 + }, + { + "epoch": 2.086734693877551, + "grad_norm": 0.6007477045059204, + "learning_rate": 5.241334960654437e-05, + "loss": 0.0218, + "step": 818 + }, + { + "epoch": 2.0892857142857144, + "grad_norm": 1.1683710813522339, + "learning_rate": 5.231866214548007e-05, + "loss": 0.049, + "step": 819 + }, + { + "epoch": 2.0918367346938775, + "grad_norm": 1.20526921749115, + "learning_rate": 5.2224116577071056e-05, + "loss": 0.0445, + "step": 820 + }, + { + "epoch": 
2.0943877551020407, + "grad_norm": 1.0474904775619507, + "learning_rate": 5.2129713836312965e-05, + "loss": 0.0886, + "step": 821 + }, + { + "epoch": 2.0969387755102042, + "grad_norm": 1.392988681793213, + "learning_rate": 5.203545485678898e-05, + "loss": 0.0477, + "step": 822 + }, + { + "epoch": 2.0994897959183674, + "grad_norm": 0.8716573119163513, + "learning_rate": 5.194134057066057e-05, + "loss": 0.0179, + "step": 823 + }, + { + "epoch": 2.1020408163265305, + "grad_norm": 1.156198501586914, + "learning_rate": 5.184737190865826e-05, + "loss": 0.0253, + "step": 824 + }, + { + "epoch": 2.104591836734694, + "grad_norm": 0.8615087866783142, + "learning_rate": 5.1753549800072444e-05, + "loss": 0.0156, + "step": 825 + }, + { + "epoch": 2.107142857142857, + "grad_norm": 0.9159624576568604, + "learning_rate": 5.165987517274422e-05, + "loss": 0.0235, + "step": 826 + }, + { + "epoch": 2.107142857142857, + "eval_NLI_loss": 0.8645581603050232, + "eval_NLI_runtime": 7.1367, + "eval_NLI_samples_per_second": 11.91, + "eval_NLI_steps_per_second": 0.14, + "eval_Qnli-dev_cosine_accuracy": 0.734375, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.6398617029190063, + "eval_Qnli-dev_cosine_ap": 0.7324710546594881, + "eval_Qnli-dev_cosine_f1": 0.7194244604316545, + "eval_Qnli-dev_cosine_f1_threshold": 0.5438587665557861, + "eval_Qnli-dev_cosine_mcc": 0.4249062491421595, + "eval_Qnli-dev_cosine_precision": 0.625, + "eval_Qnli-dev_cosine_recall": 0.847457627118644, + "eval_allNLI-dev_cosine_accuracy": 0.765625, + "eval_allNLI-dev_cosine_accuracy_threshold": 0.6523656845092773, + "eval_allNLI-dev_cosine_ap": 0.6651555400627556, + "eval_allNLI-dev_cosine_f1": 0.7058823529411764, + "eval_allNLI-dev_cosine_f1_threshold": 0.6523656845092773, + "eval_allNLI-dev_cosine_mcc": 0.5368906701581015, + "eval_allNLI-dev_cosine_precision": 0.6101694915254238, + "eval_allNLI-dev_cosine_recall": 0.8372093023255814, + "eval_sequential_score": 0.7324710546594881, + "eval_sts-test_pearson_cosine": 0.9040696211810237, + "eval_sts-test_spearman_cosine": 0.9166627759695616, + "step": 826 + }, + { + "epoch": 2.107142857142857, + "eval_natural-questions_loss": 0.13093246519565582, + "eval_natural-questions_runtime": 42.1661, + "eval_natural-questions_samples_per_second": 2.68, + "eval_natural-questions_steps_per_second": 0.024, + "step": 826 + }, + { + "epoch": 2.107142857142857, + "eval_vitaminc_loss": 2.4608752727508545, + "eval_vitaminc_runtime": 1.5361, + "eval_vitaminc_samples_per_second": 73.564, + "eval_vitaminc_steps_per_second": 0.651, + "step": 826 + }, + { + "epoch": 2.107142857142857, + "eval_xsum_loss": 0.09627868235111237, + "eval_xsum_runtime": 7.5817, + "eval_xsum_samples_per_second": 14.904, + "eval_xsum_steps_per_second": 0.132, + "step": 826 + }, + { + "epoch": 2.107142857142857, + "eval_paws_loss": 0.020066432654857635, + "eval_paws_runtime": 1.3679, + "eval_paws_samples_per_second": 82.61, + "eval_paws_steps_per_second": 0.731, + "step": 826 + }, + { + "epoch": 2.107142857142857, + "eval_global_dataset_loss": 0.2870742082595825, + "eval_global_dataset_runtime": 15.8305, + "eval_global_dataset_samples_per_second": 16.171, + "eval_global_dataset_steps_per_second": 0.063, + "step": 826 + }, + { + "epoch": 2.1096938775510203, + "grad_norm": 0.7153167128562927, + "learning_rate": 5.156634895305621e-05, + "loss": 0.013, + "step": 827 + }, + { + "epoch": 2.1122448979591835, + "grad_norm": 0.910072386264801, + "learning_rate": 5.14729720659233e-05, + "loss": 0.0197, + "step": 828 + }, + { + "epoch": 2.114795918367347, + 
"grad_norm": 1.066644549369812, + "learning_rate": 5.137974543478365e-05, + "loss": 0.0346, + "step": 829 + }, + { + "epoch": 2.11734693877551, + "grad_norm": 0.9016681909561157, + "learning_rate": 5.1286669981589443e-05, + "loss": 0.0353, + "step": 830 + }, + { + "epoch": 2.1198979591836733, + "grad_norm": 0.45996737480163574, + "learning_rate": 5.119374662679779e-05, + "loss": 0.0133, + "step": 831 + }, + { + "epoch": 2.122448979591837, + "grad_norm": 1.0577362775802612, + "learning_rate": 5.1100976289361734e-05, + "loss": 0.0382, + "step": 832 + }, + { + "epoch": 2.125, + "grad_norm": 1.185167908668518, + "learning_rate": 5.100835988672098e-05, + "loss": 0.0511, + "step": 833 + }, + { + "epoch": 2.127551020408163, + "grad_norm": 0.32819658517837524, + "learning_rate": 5.091589833479294e-05, + "loss": 0.0082, + "step": 834 + }, + { + "epoch": 2.1301020408163267, + "grad_norm": 1.3892436027526855, + "learning_rate": 5.082359254796366e-05, + "loss": 0.0424, + "step": 835 + }, + { + "epoch": 2.13265306122449, + "grad_norm": 0.6624505519866943, + "learning_rate": 5.0731443439078804e-05, + "loss": 0.016, + "step": 836 + }, + { + "epoch": 2.135204081632653, + "grad_norm": 0.8647292852401733, + "learning_rate": 5.063945191943451e-05, + "loss": 0.0227, + "step": 837 + }, + { + "epoch": 2.137755102040816, + "grad_norm": 0.6319569945335388, + "learning_rate": 5.054761889876852e-05, + "loss": 0.0172, + "step": 838 + }, + { + "epoch": 2.1403061224489797, + "grad_norm": 0.8893619179725647, + "learning_rate": 5.0455945285251114e-05, + "loss": 0.0156, + "step": 839 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.714139997959137, + "learning_rate": 5.03644319854761e-05, + "loss": 0.028, + "step": 840 + }, + { + "epoch": 2.145408163265306, + "grad_norm": 0.6543142795562744, + "learning_rate": 5.027307990445195e-05, + "loss": 0.0169, + "step": 841 + }, + { + "epoch": 2.1479591836734695, + "grad_norm": 1.596405029296875, + "learning_rate": 5.0181889945592716e-05, + "loss": 0.0636, + "step": 842 + }, + { + "epoch": 2.1505102040816326, + "grad_norm": 0.887543797492981, + "learning_rate": 5.00908630107092e-05, + "loss": 0.0321, + "step": 843 + }, + { + "epoch": 2.1530612244897958, + "grad_norm": 1.5371559858322144, + "learning_rate": 5.0000000000000016e-05, + "loss": 0.0814, + "step": 844 + }, + { + "epoch": 2.1556122448979593, + "grad_norm": 1.0012476444244385, + "learning_rate": 4.990930181204261e-05, + "loss": 0.0382, + "step": 845 + }, + { + "epoch": 2.1581632653061225, + "grad_norm": 1.147342562675476, + "learning_rate": 4.981876934378451e-05, + "loss": 0.0382, + "step": 846 + }, + { + "epoch": 2.1607142857142856, + "grad_norm": 0.5801677703857422, + "learning_rate": 4.972840349053436e-05, + "loss": 0.0478, + "step": 847 + }, + { + "epoch": 2.163265306122449, + "grad_norm": 1.3068974018096924, + "learning_rate": 4.9638205145953056e-05, + "loss": 0.0352, + "step": 848 + }, + { + "epoch": 2.1658163265306123, + "grad_norm": 2.0608410835266113, + "learning_rate": 4.954817520204496e-05, + "loss": 0.1091, + "step": 849 + }, + { + "epoch": 2.1683673469387754, + "grad_norm": 0.794710099697113, + "learning_rate": 4.945831454914904e-05, + "loss": 0.0119, + "step": 850 + }, + { + "epoch": 2.170918367346939, + "grad_norm": 1.326310157775879, + "learning_rate": 4.936862407593015e-05, + "loss": 0.0543, + "step": 851 + }, + { + "epoch": 2.173469387755102, + "grad_norm": 1.0612013339996338, + "learning_rate": 4.927910466937008e-05, + "loss": 0.0319, + "step": 852 + }, + { + "epoch": 2.1760204081632653, + 
"grad_norm": 0.27044421434402466, + "learning_rate": 4.9189757214758946e-05, + "loss": 0.0373, + "step": 853 + }, + { + "epoch": 2.1785714285714284, + "grad_norm": 2.0077497959136963, + "learning_rate": 4.910058259568636e-05, + "loss": 0.12, + "step": 854 + }, + { + "epoch": 2.181122448979592, + "grad_norm": 0.46827951073646545, + "learning_rate": 4.9011581694032646e-05, + "loss": 0.0099, + "step": 855 + }, + { + "epoch": 2.183673469387755, + "grad_norm": 0.8061081171035767, + "learning_rate": 4.8922755389960276e-05, + "loss": 0.0213, + "step": 856 + }, + { + "epoch": 2.186224489795918, + "grad_norm": 0.7275615930557251, + "learning_rate": 4.883410456190496e-05, + "loss": 0.019, + "step": 857 + }, + { + "epoch": 2.188775510204082, + "grad_norm": 1.4744590520858765, + "learning_rate": 4.87456300865671e-05, + "loss": 0.0661, + "step": 858 + }, + { + "epoch": 2.191326530612245, + "grad_norm": 1.3601109981536865, + "learning_rate": 4.865733283890311e-05, + "loss": 0.0562, + "step": 859 + }, + { + "epoch": 2.193877551020408, + "grad_norm": 1.0560811758041382, + "learning_rate": 4.856921369211669e-05, + "loss": 0.0285, + "step": 860 + }, + { + "epoch": 2.1964285714285716, + "grad_norm": 1.2819187641143799, + "learning_rate": 4.848127351765027e-05, + "loss": 0.0625, + "step": 861 + }, + { + "epoch": 2.1989795918367347, + "grad_norm": 0.3279810845851898, + "learning_rate": 4.839351318517634e-05, + "loss": 0.0065, + "step": 862 + }, + { + "epoch": 2.201530612244898, + "grad_norm": 1.2752312421798706, + "learning_rate": 4.830593356258888e-05, + "loss": 0.0357, + "step": 863 + }, + { + "epoch": 2.204081632653061, + "grad_norm": 0.6669948101043701, + "learning_rate": 4.8218535515994726e-05, + "loss": 0.026, + "step": 864 + }, + { + "epoch": 2.2066326530612246, + "grad_norm": 0.5878826379776001, + "learning_rate": 4.813131990970509e-05, + "loss": 0.0144, + "step": 865 + }, + { + "epoch": 2.2091836734693877, + "grad_norm": 0.46150872111320496, + "learning_rate": 4.8044287606226936e-05, + "loss": 0.0227, + "step": 866 + }, + { + "epoch": 2.211734693877551, + "grad_norm": 0.8356884121894836, + "learning_rate": 4.795743946625449e-05, + "loss": 0.0415, + "step": 867 + }, + { + "epoch": 2.2142857142857144, + "grad_norm": 0.7677668929100037, + "learning_rate": 4.7870776348660754e-05, + "loss": 0.0484, + "step": 868 + }, + { + "epoch": 2.2168367346938775, + "grad_norm": 0.7589597702026367, + "learning_rate": 4.778429911048888e-05, + "loss": 0.0406, + "step": 869 + }, + { + "epoch": 2.2193877551020407, + "grad_norm": 0.8130046129226685, + "learning_rate": 4.769800860694387e-05, + "loss": 0.0183, + "step": 870 + }, + { + "epoch": 2.2219387755102042, + "grad_norm": 1.2456495761871338, + "learning_rate": 4.7611905691384054e-05, + "loss": 0.0585, + "step": 871 + }, + { + "epoch": 2.2244897959183674, + "grad_norm": 1.1810108423233032, + "learning_rate": 4.752599121531256e-05, + "loss": 0.0446, + "step": 872 + }, + { + "epoch": 2.2270408163265305, + "grad_norm": 1.539719820022583, + "learning_rate": 4.744026602836902e-05, + "loss": 0.0763, + "step": 873 + }, + { + "epoch": 2.229591836734694, + "grad_norm": 0.7696815729141235, + "learning_rate": 4.7354730978321114e-05, + "loss": 0.0115, + "step": 874 + }, + { + "epoch": 2.232142857142857, + "grad_norm": 1.0498219728469849, + "learning_rate": 4.726938691105617e-05, + "loss": 0.0384, + "step": 875 + }, + { + "epoch": 2.2346938775510203, + "grad_norm": 0.7437430024147034, + "learning_rate": 4.718423467057284e-05, + "loss": 0.0093, + "step": 876 + }, + { + "epoch": 
2.237244897959184, + "grad_norm": 2.278657913208008, + "learning_rate": 4.709927509897272e-05, + "loss": 0.4063, + "step": 877 + }, + { + "epoch": 2.239795918367347, + "grad_norm": 1.5163755416870117, + "learning_rate": 4.7014509036452036e-05, + "loss": 0.0469, + "step": 878 + }, + { + "epoch": 2.24234693877551, + "grad_norm": 0.44955262541770935, + "learning_rate": 4.6929937321293306e-05, + "loss": 0.0522, + "step": 879 + }, + { + "epoch": 2.2448979591836733, + "grad_norm": 1.1311742067337036, + "learning_rate": 4.6845560789857113e-05, + "loss": 0.037, + "step": 880 + }, + { + "epoch": 2.247448979591837, + "grad_norm": 0.905463695526123, + "learning_rate": 4.676138027657381e-05, + "loss": 0.0345, + "step": 881 + }, + { + "epoch": 2.25, + "grad_norm": 0.740746259689331, + "learning_rate": 4.6677396613935176e-05, + "loss": 0.0433, + "step": 882 + }, + { + "epoch": 2.252551020408163, + "grad_norm": 1.2629139423370361, + "learning_rate": 4.659361063248636e-05, + "loss": 0.0564, + "step": 883 + }, + { + "epoch": 2.2551020408163267, + "grad_norm": 0.4700453579425812, + "learning_rate": 4.6510023160817496e-05, + "loss": 0.0088, + "step": 884 + }, + { + "epoch": 2.25765306122449, + "grad_norm": 1.5105782747268677, + "learning_rate": 4.642663502555561e-05, + "loss": 0.0453, + "step": 885 + }, + { + "epoch": 2.25765306122449, + "eval_NLI_loss": 0.8796668648719788, + "eval_NLI_runtime": 7.1506, + "eval_NLI_samples_per_second": 11.887, + "eval_NLI_steps_per_second": 0.14, + "eval_Qnli-dev_cosine_accuracy": 0.7265625, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.6527288556098938, + "eval_Qnli-dev_cosine_ap": 0.7336812042057156, + "eval_Qnli-dev_cosine_f1": 0.7285714285714285, + "eval_Qnli-dev_cosine_f1_threshold": 0.5432453751564026, + "eval_Qnli-dev_cosine_mcc": 0.44427085115126513, + "eval_Qnli-dev_cosine_precision": 0.6296296296296297, + "eval_Qnli-dev_cosine_recall": 0.864406779661017, + "eval_allNLI-dev_cosine_accuracy": 0.765625, + "eval_allNLI-dev_cosine_accuracy_threshold": 0.6623817682266235, + "eval_allNLI-dev_cosine_ap": 0.6711595088563489, + "eval_allNLI-dev_cosine_f1": 0.7047619047619047, + "eval_allNLI-dev_cosine_f1_threshold": 0.6251412034034729, + "eval_allNLI-dev_cosine_mcc": 0.5352526699107679, + "eval_allNLI-dev_cosine_precision": 0.5967741935483871, + "eval_allNLI-dev_cosine_recall": 0.8604651162790697, + "eval_sequential_score": 0.7336812042057156, + "eval_sts-test_pearson_cosine": 0.9029692074179523, + "eval_sts-test_spearman_cosine": 0.9160408676900055, + "step": 885 + }, + { + "epoch": 2.25765306122449, + "eval_natural-questions_loss": 0.12700842320919037, + "eval_natural-questions_runtime": 42.1814, + "eval_natural-questions_samples_per_second": 2.679, + "eval_natural-questions_steps_per_second": 0.024, + "step": 885 + }, + { + "epoch": 2.25765306122449, + "eval_vitaminc_loss": 2.391439914703369, + "eval_vitaminc_runtime": 1.5128, + "eval_vitaminc_samples_per_second": 74.694, + "eval_vitaminc_steps_per_second": 0.661, + "step": 885 + }, + { + "epoch": 2.25765306122449, + "eval_xsum_loss": 0.10802821069955826, + "eval_xsum_runtime": 7.5894, + "eval_xsum_samples_per_second": 14.889, + "eval_xsum_steps_per_second": 0.132, + "step": 885 + }, + { + "epoch": 2.25765306122449, + "eval_paws_loss": 0.01981884427368641, + "eval_paws_runtime": 1.3732, + "eval_paws_samples_per_second": 82.293, + "eval_paws_steps_per_second": 0.728, + "step": 885 + }, + { + "epoch": 2.25765306122449, + "eval_global_dataset_loss": 0.28854385018348694, + "eval_global_dataset_runtime": 15.8413, + 
"eval_global_dataset_samples_per_second": 16.16, + "eval_global_dataset_steps_per_second": 0.063, + "step": 885 + }, + { + "epoch": 2.260204081632653, + "grad_norm": 1.133180022239685, + "learning_rate": 4.634344705135644e-05, + "loss": 0.0458, + "step": 886 + }, + { + "epoch": 2.262755102040816, + "grad_norm": 1.0984524488449097, + "learning_rate": 4.626046006089623e-05, + "loss": 0.035, + "step": 887 + }, + { + "epoch": 2.2653061224489797, + "grad_norm": 1.1998791694641113, + "learning_rate": 4.6177674874863596e-05, + "loss": 0.0296, + "step": 888 + }, + { + "epoch": 2.267857142857143, + "grad_norm": 0.8947032690048218, + "learning_rate": 4.6095092311951504e-05, + "loss": 0.0188, + "step": 889 + }, + { + "epoch": 2.270408163265306, + "grad_norm": 1.3751288652420044, + "learning_rate": 4.6012713188849055e-05, + "loss": 0.0759, + "step": 890 + }, + { + "epoch": 2.2729591836734695, + "grad_norm": 0.6995999813079834, + "learning_rate": 4.5930538320233465e-05, + "loss": 0.0136, + "step": 891 + }, + { + "epoch": 2.2755102040816326, + "grad_norm": 1.6041380167007446, + "learning_rate": 4.584856851876205e-05, + "loss": 0.0596, + "step": 892 + }, + { + "epoch": 2.2780612244897958, + "grad_norm": 2.4001729488372803, + "learning_rate": 4.576680459506405e-05, + "loss": 0.2908, + "step": 893 + }, + { + "epoch": 2.2806122448979593, + "grad_norm": 1.2532007694244385, + "learning_rate": 4.5685247357732815e-05, + "loss": 0.0413, + "step": 894 + }, + { + "epoch": 2.2831632653061225, + "grad_norm": 0.639712929725647, + "learning_rate": 4.5603897613317594e-05, + "loss": 0.017, + "step": 895 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.5773519277572632, + "learning_rate": 4.552275616631575e-05, + "loss": 0.0161, + "step": 896 + }, + { + "epoch": 2.288265306122449, + "grad_norm": 1.1382379531860352, + "learning_rate": 4.5441823819164654e-05, + "loss": 0.0636, + "step": 897 + }, + { + "epoch": 2.2908163265306123, + "grad_norm": 1.212735891342163, + "learning_rate": 4.5361101372233844e-05, + "loss": 0.0259, + "step": 898 + }, + { + "epoch": 2.2933673469387754, + "grad_norm": 0.6108598709106445, + "learning_rate": 4.528058962381706e-05, + "loss": 0.0124, + "step": 899 + }, + { + "epoch": 2.295918367346939, + "grad_norm": 0.9034563302993774, + "learning_rate": 4.520028937012439e-05, + "loss": 0.0227, + "step": 900 + }, + { + "epoch": 2.298469387755102, + "grad_norm": 1.5188766717910767, + "learning_rate": 4.5120201405274364e-05, + "loss": 0.0917, + "step": 901 + }, + { + "epoch": 2.3010204081632653, + "grad_norm": 1.3282853364944458, + "learning_rate": 4.504032652128609e-05, + "loss": 0.0512, + "step": 902 + }, + { + "epoch": 2.3035714285714284, + "grad_norm": 0.37245428562164307, + "learning_rate": 4.496066550807148e-05, + "loss": 0.0086, + "step": 903 + }, + { + "epoch": 2.306122448979592, + "grad_norm": 0.41504737734794617, + "learning_rate": 4.488121915342736e-05, + "loss": 0.0068, + "step": 904 + }, + { + "epoch": 2.308673469387755, + "grad_norm": 0.6557116508483887, + "learning_rate": 4.480198824302775e-05, + "loss": 0.0134, + "step": 905 + }, + { + "epoch": 2.311224489795918, + "grad_norm": 1.189799189567566, + "learning_rate": 4.472297356041604e-05, + "loss": 0.0374, + "step": 906 + }, + { + "epoch": 2.313775510204082, + "grad_norm": 1.1441396474838257, + "learning_rate": 4.464417588699729e-05, + "loss": 0.0358, + "step": 907 + }, + { + "epoch": 2.316326530612245, + "grad_norm": 2.222879409790039, + "learning_rate": 4.456559600203045e-05, + "loss": 0.3427, + "step": 908 + }, + { + "epoch": 
2.318877551020408, + "grad_norm": 0.5802832841873169, + "learning_rate": 4.4487234682620684e-05, + "loss": 0.0152, + "step": 909 + }, + { + "epoch": 2.3214285714285716, + "grad_norm": 0.796970784664154, + "learning_rate": 4.440909270371172e-05, + "loss": 0.0187, + "step": 910 + }, + { + "epoch": 2.3239795918367347, + "grad_norm": 0.3190988302230835, + "learning_rate": 4.4331170838078086e-05, + "loss": 0.0062, + "step": 911 + }, + { + "epoch": 2.326530612244898, + "grad_norm": 2.06770920753479, + "learning_rate": 4.425346985631759e-05, + "loss": 0.0839, + "step": 912 + }, + { + "epoch": 2.329081632653061, + "grad_norm": 0.3080907166004181, + "learning_rate": 4.4175990526843595e-05, + "loss": 0.0673, + "step": 913 + }, + { + "epoch": 2.3316326530612246, + "grad_norm": 1.1664047241210938, + "learning_rate": 4.4098733615877455e-05, + "loss": 0.0312, + "step": 914 + }, + { + "epoch": 2.3341836734693877, + "grad_norm": 1.4114701747894287, + "learning_rate": 4.4021699887441e-05, + "loss": 0.071, + "step": 915 + }, + { + "epoch": 2.336734693877551, + "grad_norm": 0.9187290668487549, + "learning_rate": 4.3944890103348915e-05, + "loss": 0.0294, + "step": 916 + }, + { + "epoch": 2.3392857142857144, + "grad_norm": 1.0366332530975342, + "learning_rate": 4.386830502320114e-05, + "loss": 0.0514, + "step": 917 + }, + { + "epoch": 2.3418367346938775, + "grad_norm": 1.0871851444244385, + "learning_rate": 4.379194540437555e-05, + "loss": 0.0391, + "step": 918 + }, + { + "epoch": 2.3443877551020407, + "grad_norm": 0.6311087012290955, + "learning_rate": 4.371581200202026e-05, + "loss": 0.014, + "step": 919 + }, + { + "epoch": 2.3469387755102042, + "grad_norm": 1.514604091644287, + "learning_rate": 4.36399055690463e-05, + "loss": 0.0613, + "step": 920 + }, + { + "epoch": 2.3494897959183674, + "grad_norm": 0.5482910871505737, + "learning_rate": 4.3564226856120096e-05, + "loss": 0.0106, + "step": 921 + }, + { + "epoch": 2.3520408163265305, + "grad_norm": 1.241115689277649, + "learning_rate": 4.348877661165608e-05, + "loss": 0.0584, + "step": 922 + }, + { + "epoch": 2.354591836734694, + "grad_norm": 1.6827380657196045, + "learning_rate": 4.341355558180924e-05, + "loss": 0.084, + "step": 923 + }, + { + "epoch": 2.357142857142857, + "grad_norm": 0.44441261887550354, + "learning_rate": 4.333856451046782e-05, + "loss": 0.0116, + "step": 924 + }, + { + "epoch": 2.3596938775510203, + "grad_norm": 1.0744472742080688, + "learning_rate": 4.3263804139245916e-05, + "loss": 0.0423, + "step": 925 + }, + { + "epoch": 2.362244897959184, + "grad_norm": 0.6052895188331604, + "learning_rate": 4.318927520747612e-05, + "loss": 0.015, + "step": 926 + }, + { + "epoch": 2.364795918367347, + "grad_norm": 0.8525500893592834, + "learning_rate": 4.311497845220226e-05, + "loss": 0.0186, + "step": 927 + }, + { + "epoch": 2.36734693877551, + "grad_norm": 1.460869550704956, + "learning_rate": 4.304091460817207e-05, + "loss": 0.0677, + "step": 928 + }, + { + "epoch": 2.3698979591836733, + "grad_norm": 1.2764791250228882, + "learning_rate": 4.296708440782992e-05, + "loss": 0.0653, + "step": 929 + }, + { + "epoch": 2.372448979591837, + "grad_norm": 1.3560532331466675, + "learning_rate": 4.289348858130966e-05, + "loss": 0.04, + "step": 930 + }, + { + "epoch": 2.375, + "grad_norm": 1.0868048667907715, + "learning_rate": 4.2820127856427275e-05, + "loss": 0.0428, + "step": 931 + }, + { + "epoch": 2.377551020408163, + "grad_norm": 1.2073615789413452, + "learning_rate": 4.2747002958673734e-05, + "loss": 0.0299, + "step": 932 + }, + { + "epoch": 
2.3801020408163267, + "grad_norm": 1.2244027853012085, + "learning_rate": 4.2674114611207886e-05, + "loss": 0.0339, + "step": 933 + }, + { + "epoch": 2.38265306122449, + "grad_norm": 1.2330187559127808, + "learning_rate": 4.2601463534849184e-05, + "loss": 0.0625, + "step": 934 + }, + { + "epoch": 2.385204081632653, + "grad_norm": 1.6276096105575562, + "learning_rate": 4.252905044807065e-05, + "loss": 0.0688, + "step": 935 + }, + { + "epoch": 2.387755102040816, + "grad_norm": 0.8443472385406494, + "learning_rate": 4.2456876066991766e-05, + "loss": 0.0179, + "step": 936 + }, + { + "epoch": 2.3903061224489797, + "grad_norm": 0.3808949887752533, + "learning_rate": 4.238494110537131e-05, + "loss": 0.0091, + "step": 937 + }, + { + "epoch": 2.392857142857143, + "grad_norm": 1.4255057573318481, + "learning_rate": 4.231324627460038e-05, + "loss": 0.0994, + "step": 938 + }, + { + "epoch": 2.395408163265306, + "grad_norm": 1.3318042755126953, + "learning_rate": 4.224179228369534e-05, + "loss": 0.0397, + "step": 939 + }, + { + "epoch": 2.3979591836734695, + "grad_norm": 0.8308050036430359, + "learning_rate": 4.217057983929081e-05, + "loss": 0.0497, + "step": 940 + }, + { + "epoch": 2.4005102040816326, + "grad_norm": 0.7053787112236023, + "learning_rate": 4.209960964563262e-05, + "loss": 0.0108, + "step": 941 + }, + { + "epoch": 2.4030612244897958, + "grad_norm": 1.8520581722259521, + "learning_rate": 4.2028882404570966e-05, + "loss": 0.1045, + "step": 942 + }, + { + "epoch": 2.4056122448979593, + "grad_norm": 1.608733892440796, + "learning_rate": 4.1958398815553336e-05, + "loss": 0.0825, + "step": 943 + }, + { + "epoch": 2.4081632653061225, + "grad_norm": 0.9939666986465454, + "learning_rate": 4.1888159575617656e-05, + "loss": 0.0253, + "step": 944 + }, + { + "epoch": 2.4081632653061225, + "eval_NLI_loss": 0.8489585518836975, + "eval_NLI_runtime": 7.1287, + "eval_NLI_samples_per_second": 11.924, + "eval_NLI_steps_per_second": 0.14, + "eval_Qnli-dev_cosine_accuracy": 0.71875, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.678109884262085, + "eval_Qnli-dev_cosine_ap": 0.7334066331731919, + "eval_Qnli-dev_cosine_f1": 0.723404255319149, + "eval_Qnli-dev_cosine_f1_threshold": 0.5386093258857727, + "eval_Qnli-dev_cosine_mcc": 0.4312710551849853, + "eval_Qnli-dev_cosine_precision": 0.6219512195121951, + "eval_Qnli-dev_cosine_recall": 0.864406779661017, + "eval_allNLI-dev_cosine_accuracy": 0.765625, + "eval_allNLI-dev_cosine_accuracy_threshold": 0.6320295333862305, + "eval_allNLI-dev_cosine_ap": 0.6638823773283118, + "eval_allNLI-dev_cosine_f1": 0.7169811320754716, + "eval_allNLI-dev_cosine_f1_threshold": 0.6320295333862305, + "eval_allNLI-dev_cosine_mcc": 0.5570274927264262, + "eval_allNLI-dev_cosine_precision": 0.6031746031746031, + "eval_allNLI-dev_cosine_recall": 0.8837209302325582, + "eval_sequential_score": 0.7334066331731919, + "eval_sts-test_pearson_cosine": 0.9012405420333187, + "eval_sts-test_spearman_cosine": 0.9149766826941179, + "step": 944 + }, + { + "epoch": 2.4081632653061225, + "eval_natural-questions_loss": 0.1184859350323677, + "eval_natural-questions_runtime": 42.1792, + "eval_natural-questions_samples_per_second": 2.679, + "eval_natural-questions_steps_per_second": 0.024, + "step": 944 + }, + { + "epoch": 2.4081632653061225, + "eval_vitaminc_loss": 2.3981873989105225, + "eval_vitaminc_runtime": 1.5099, + "eval_vitaminc_samples_per_second": 74.842, + "eval_vitaminc_steps_per_second": 0.662, + "step": 944 + }, + { + "epoch": 2.4081632653061225, + "eval_xsum_loss": 0.10035312920808792, + 
"eval_xsum_runtime": 7.5877, + "eval_xsum_samples_per_second": 14.892, + "eval_xsum_steps_per_second": 0.132, + "step": 944 + }, + { + "epoch": 2.4081632653061225, + "eval_paws_loss": 0.01954275369644165, + "eval_paws_runtime": 1.367, + "eval_paws_samples_per_second": 82.665, + "eval_paws_steps_per_second": 0.732, + "step": 944 + }, + { + "epoch": 2.4081632653061225, + "eval_global_dataset_loss": 0.2837199568748474, + "eval_global_dataset_runtime": 15.8802, + "eval_global_dataset_samples_per_second": 16.121, + "eval_global_dataset_steps_per_second": 0.063, + "step": 944 + }, + { + "epoch": 2.4107142857142856, + "grad_norm": 0.6821095943450928, + "learning_rate": 4.1818165379385464e-05, + "loss": 0.0166, + "step": 945 + }, + { + "epoch": 2.413265306122449, + "grad_norm": 1.1926023960113525, + "learning_rate": 4.174841691905489e-05, + "loss": 0.0383, + "step": 946 + }, + { + "epoch": 2.4158163265306123, + "grad_norm": 1.3046187162399292, + "learning_rate": 4.1678914884393914e-05, + "loss": 0.034, + "step": 947 + }, + { + "epoch": 2.4183673469387754, + "grad_norm": 1.4456415176391602, + "learning_rate": 4.1609659962733565e-05, + "loss": 0.0541, + "step": 948 + }, + { + "epoch": 2.420918367346939, + "grad_norm": 0.8477250933647156, + "learning_rate": 4.154065283896102e-05, + "loss": 0.0368, + "step": 949 + }, + { + "epoch": 2.423469387755102, + "grad_norm": 0.8565794229507446, + "learning_rate": 4.1471894195512926e-05, + "loss": 0.0265, + "step": 950 + }, + { + "epoch": 2.4260204081632653, + "grad_norm": 1.8057187795639038, + "learning_rate": 4.1403384712368634e-05, + "loss": 0.1426, + "step": 951 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 1.1919145584106445, + "learning_rate": 4.133512506704342e-05, + "loss": 0.0408, + "step": 952 + }, + { + "epoch": 2.431122448979592, + "grad_norm": 1.759466528892517, + "learning_rate": 4.126711593458184e-05, + "loss": 0.0955, + "step": 953 + }, + { + "epoch": 2.433673469387755, + "grad_norm": 1.5274937152862549, + "learning_rate": 4.1199357987551024e-05, + "loss": 0.1052, + "step": 954 + }, + { + "epoch": 2.436224489795918, + "grad_norm": 0.8126481175422668, + "learning_rate": 4.1131851896034065e-05, + "loss": 0.0293, + "step": 955 + }, + { + "epoch": 2.438775510204082, + "grad_norm": 2.3400230407714844, + "learning_rate": 4.1064598327623346e-05, + "loss": 0.2234, + "step": 956 + }, + { + "epoch": 2.441326530612245, + "grad_norm": 1.1887966394424438, + "learning_rate": 4.099759794741397e-05, + "loss": 0.0417, + "step": 957 + }, + { + "epoch": 2.443877551020408, + "grad_norm": 1.1620210409164429, + "learning_rate": 4.093085141799714e-05, + "loss": 0.0385, + "step": 958 + }, + { + "epoch": 2.4464285714285716, + "grad_norm": 1.2781778573989868, + "learning_rate": 4.0864359399453646e-05, + "loss": 0.0373, + "step": 959 + }, + { + "epoch": 2.4489795918367347, + "grad_norm": 0.7626058459281921, + "learning_rate": 4.079812254934737e-05, + "loss": 0.0186, + "step": 960 + }, + { + "epoch": 2.451530612244898, + "grad_norm": 0.28274211287498474, + "learning_rate": 4.073214152271869e-05, + "loss": 0.0448, + "step": 961 + }, + { + "epoch": 2.454081632653061, + "grad_norm": 1.123368740081787, + "learning_rate": 4.066641697207806e-05, + "loss": 0.0414, + "step": 962 + }, + { + "epoch": 2.4566326530612246, + "grad_norm": 1.4165430068969727, + "learning_rate": 4.060094954739956e-05, + "loss": 0.0337, + "step": 963 + }, + { + "epoch": 2.4591836734693877, + "grad_norm": 1.1798646450042725, + "learning_rate": 4.053573989611448e-05, + "loss": 0.0282, + "step": 964 + 
}, + { + "epoch": 2.461734693877551, + "grad_norm": 1.3918527364730835, + "learning_rate": 4.047078866310484e-05, + "loss": 0.0464, + "step": 965 + }, + { + "epoch": 2.4642857142857144, + "grad_norm": 0.826269268989563, + "learning_rate": 4.040609649069711e-05, + "loss": 0.0204, + "step": 966 + }, + { + "epoch": 2.4668367346938775, + "grad_norm": 0.7990775108337402, + "learning_rate": 4.0341664018655814e-05, + "loss": 0.0209, + "step": 967 + }, + { + "epoch": 2.4693877551020407, + "grad_norm": 1.043322205543518, + "learning_rate": 4.027749188417715e-05, + "loss": 0.042, + "step": 968 + }, + { + "epoch": 2.4719387755102042, + "grad_norm": 1.7069913148880005, + "learning_rate": 4.0213580721882836e-05, + "loss": 0.0653, + "step": 969 + }, + { + "epoch": 2.4744897959183674, + "grad_norm": 1.3646513223648071, + "learning_rate": 4.0149931163813676e-05, + "loss": 0.0481, + "step": 970 + }, + { + "epoch": 2.4770408163265305, + "grad_norm": 0.8509479761123657, + "learning_rate": 4.008654383942337e-05, + "loss": 0.0421, + "step": 971 + }, + { + "epoch": 2.479591836734694, + "grad_norm": 1.2394309043884277, + "learning_rate": 4.0023419375572334e-05, + "loss": 0.0737, + "step": 972 + }, + { + "epoch": 2.482142857142857, + "grad_norm": 0.9207291007041931, + "learning_rate": 3.996055839652142e-05, + "loss": 0.032, + "step": 973 + }, + { + "epoch": 2.4846938775510203, + "grad_norm": 1.2213928699493408, + "learning_rate": 3.989796152392581e-05, + "loss": 0.0607, + "step": 974 + }, + { + "epoch": 2.487244897959184, + "grad_norm": 1.6579464673995972, + "learning_rate": 3.9835629376828846e-05, + "loss": 0.0784, + "step": 975 + }, + { + "epoch": 2.489795918367347, + "grad_norm": 0.8070271611213684, + "learning_rate": 3.9773562571655866e-05, + "loss": 0.028, + "step": 976 + }, + { + "epoch": 2.49234693877551, + "grad_norm": 1.5141855478286743, + "learning_rate": 3.971176172220818e-05, + "loss": 0.0922, + "step": 977 + }, + { + "epoch": 2.4948979591836733, + "grad_norm": 1.0904344320297241, + "learning_rate": 3.9650227439656926e-05, + "loss": 0.0376, + "step": 978 + }, + { + "epoch": 2.497448979591837, + "grad_norm": 1.3054485321044922, + "learning_rate": 3.958896033253714e-05, + "loss": 0.0346, + "step": 979 + }, + { + "epoch": 2.5, + "grad_norm": 1.2881898880004883, + "learning_rate": 3.9527961006741574e-05, + "loss": 0.0977, + "step": 980 + }, + { + "epoch": 2.502551020408163, + "grad_norm": 1.2697529792785645, + "learning_rate": 3.946723006551485e-05, + "loss": 0.0471, + "step": 981 + }, + { + "epoch": 2.5051020408163263, + "grad_norm": 1.5069528818130493, + "learning_rate": 3.940676810944742e-05, + "loss": 0.0616, + "step": 982 + }, + { + "epoch": 2.50765306122449, + "grad_norm": 0.7574511766433716, + "learning_rate": 3.934657573646961e-05, + "loss": 0.0368, + "step": 983 + }, + { + "epoch": 2.510204081632653, + "grad_norm": 0.7946884036064148, + "learning_rate": 3.928665354184583e-05, + "loss": 0.0169, + "step": 984 + }, + { + "epoch": 2.512755102040816, + "grad_norm": 1.029691219329834, + "learning_rate": 3.92270021181685e-05, + "loss": 0.0298, + "step": 985 + }, + { + "epoch": 2.5153061224489797, + "grad_norm": 0.8419845700263977, + "learning_rate": 3.9167622055352323e-05, + "loss": 0.0187, + "step": 986 + }, + { + "epoch": 2.517857142857143, + "grad_norm": 1.2529100179672241, + "learning_rate": 3.9108513940628464e-05, + "loss": 0.0529, + "step": 987 + }, + { + "epoch": 2.520408163265306, + "grad_norm": 1.389549970626831, + "learning_rate": 3.90496783585386e-05, + "loss": 0.0346, + "step": 988 + }, + { 
+ "epoch": 2.5229591836734695, + "grad_norm": 1.6846857070922852, + "learning_rate": 3.8991115890929306e-05, + "loss": 0.0879, + "step": 989 + }, + { + "epoch": 2.5255102040816326, + "grad_norm": 1.0223442316055298, + "learning_rate": 3.8932827116946215e-05, + "loss": 0.0234, + "step": 990 + }, + { + "epoch": 2.5280612244897958, + "grad_norm": 1.0513767004013062, + "learning_rate": 3.887481261302829e-05, + "loss": 0.0493, + "step": 991 + }, + { + "epoch": 2.5306122448979593, + "grad_norm": 1.3534318208694458, + "learning_rate": 3.881707295290212e-05, + "loss": 0.0901, + "step": 992 + }, + { + "epoch": 2.5331632653061225, + "grad_norm": 1.0040754079818726, + "learning_rate": 3.8759608707576314e-05, + "loss": 0.0377, + "step": 993 + }, + { + "epoch": 2.5357142857142856, + "grad_norm": 0.9664008021354675, + "learning_rate": 3.870242044533576e-05, + "loss": 0.031, + "step": 994 + }, + { + "epoch": 2.538265306122449, + "grad_norm": 1.4631917476654053, + "learning_rate": 3.8645508731736066e-05, + "loss": 0.0572, + "step": 995 + }, + { + "epoch": 2.5408163265306123, + "grad_norm": 0.6983744502067566, + "learning_rate": 3.8588874129597946e-05, + "loss": 0.0454, + "step": 996 + }, + { + "epoch": 2.5433673469387754, + "grad_norm": 0.8332878351211548, + "learning_rate": 3.8532517199001654e-05, + "loss": 0.0433, + "step": 997 + }, + { + "epoch": 2.545918367346939, + "grad_norm": 1.5428073406219482, + "learning_rate": 3.847643849728146e-05, + "loss": 0.0589, + "step": 998 + }, + { + "epoch": 2.548469387755102, + "grad_norm": 0.7662918567657471, + "learning_rate": 3.842063857902013e-05, + "loss": 0.0217, + "step": 999 + }, + { + "epoch": 2.5510204081632653, + "grad_norm": 1.2972546815872192, + "learning_rate": 3.8365117996043406e-05, + "loss": 0.0483, + "step": 1000 + }, + { + "epoch": 2.553571428571429, + "grad_norm": 1.4217555522918701, + "learning_rate": 3.830987729741462e-05, + "loss": 0.0604, + "step": 1001 + }, + { + "epoch": 2.556122448979592, + "grad_norm": 0.8894994854927063, + "learning_rate": 3.82549170294292e-05, + "loss": 0.021, + "step": 1002 + }, + { + "epoch": 2.558673469387755, + "grad_norm": 1.4004552364349365, + "learning_rate": 3.820023773560931e-05, + "loss": 0.0577, + "step": 1003 + }, + { + "epoch": 2.558673469387755, + "eval_NLI_loss": 0.7841311693191528, + "eval_NLI_runtime": 7.1434, + "eval_NLI_samples_per_second": 11.899, + "eval_NLI_steps_per_second": 0.14, + "eval_Qnli-dev_cosine_accuracy": 0.7265625, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.6512576341629028, + "eval_Qnli-dev_cosine_ap": 0.7341027677613995, + "eval_Qnli-dev_cosine_f1": 0.7123287671232876, + "eval_Qnli-dev_cosine_f1_threshold": 0.513559103012085, + "eval_Qnli-dev_cosine_mcc": 0.3996665462019395, + "eval_Qnli-dev_cosine_precision": 0.5977011494252874, + "eval_Qnli-dev_cosine_recall": 0.8813559322033898, + "eval_allNLI-dev_cosine_accuracy": 0.7578125, + "eval_allNLI-dev_cosine_accuracy_threshold": 0.760259747505188, + "eval_allNLI-dev_cosine_ap": 0.6718319577561838, + "eval_allNLI-dev_cosine_f1": 0.7102803738317758, + "eval_allNLI-dev_cosine_f1_threshold": 0.6331969499588013, + "eval_allNLI-dev_cosine_mcc": 0.5458461472025508, + "eval_allNLI-dev_cosine_precision": 0.59375, + "eval_allNLI-dev_cosine_recall": 0.8837209302325582, + "eval_sequential_score": 0.7341027677613995, + "eval_sts-test_pearson_cosine": 0.9036567724584882, + "eval_sts-test_spearman_cosine": 0.9167097200427613, + "step": 1003 + }, + { + "epoch": 2.558673469387755, + "eval_natural-questions_loss": 0.12162257730960846, + 
"eval_natural-questions_runtime": 42.1766, + "eval_natural-questions_samples_per_second": 2.679, + "eval_natural-questions_steps_per_second": 0.024, + "step": 1003 + }, + { + "epoch": 2.558673469387755, + "eval_vitaminc_loss": 2.386068344116211, + "eval_vitaminc_runtime": 1.5139, + "eval_vitaminc_samples_per_second": 74.643, + "eval_vitaminc_steps_per_second": 0.661, + "step": 1003 + }, + { + "epoch": 2.558673469387755, + "eval_xsum_loss": 0.10199504345655441, + "eval_xsum_runtime": 7.591, + "eval_xsum_samples_per_second": 14.886, + "eval_xsum_steps_per_second": 0.132, + "step": 1003 + }, + { + "epoch": 2.558673469387755, + "eval_paws_loss": 0.019558368250727654, + "eval_paws_runtime": 1.3708, + "eval_paws_samples_per_second": 82.432, + "eval_paws_steps_per_second": 0.729, + "step": 1003 + }, + { + "epoch": 2.558673469387755, + "eval_global_dataset_loss": 0.2749599516391754, + "eval_global_dataset_runtime": 15.8488, + "eval_global_dataset_samples_per_second": 16.153, + "eval_global_dataset_steps_per_second": 0.063, + "step": 1003 + }, + { + "epoch": 2.561224489795918, + "grad_norm": 1.4330050945281982, + "learning_rate": 3.814583995669844e-05, + "loss": 0.0405, + "step": 1004 + }, + { + "epoch": 2.563775510204082, + "grad_norm": 1.254364252090454, + "learning_rate": 3.809172423065611e-05, + "loss": 0.0543, + "step": 1005 + }, + { + "epoch": 2.566326530612245, + "grad_norm": 0.7061953544616699, + "learning_rate": 3.803789109265247e-05, + "loss": 0.0141, + "step": 1006 + }, + { + "epoch": 2.568877551020408, + "grad_norm": 0.44512051343917847, + "learning_rate": 3.798434107506308e-05, + "loss": 0.0132, + "step": 1007 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.8475450873374939, + "learning_rate": 3.793107470746362e-05, + "loss": 0.0195, + "step": 1008 + }, + { + "epoch": 2.5739795918367347, + "grad_norm": 1.2412010431289673, + "learning_rate": 3.787809251662465e-05, + "loss": 0.0438, + "step": 1009 + }, + { + "epoch": 2.576530612244898, + "grad_norm": 1.7806142568588257, + "learning_rate": 3.78253950265064e-05, + "loss": 0.1235, + "step": 1010 + }, + { + "epoch": 2.579081632653061, + "grad_norm": 0.8530858159065247, + "learning_rate": 3.777298275825358e-05, + "loss": 0.0168, + "step": 1011 + }, + { + "epoch": 2.5816326530612246, + "grad_norm": 1.1695044040679932, + "learning_rate": 3.772085623019025e-05, + "loss": 0.0447, + "step": 1012 + }, + { + "epoch": 2.5841836734693877, + "grad_norm": 1.1708300113677979, + "learning_rate": 3.766901595781468e-05, + "loss": 0.0596, + "step": 1013 + }, + { + "epoch": 2.586734693877551, + "grad_norm": 1.2334167957305908, + "learning_rate": 3.7617462453794255e-05, + "loss": 0.0371, + "step": 1014 + }, + { + "epoch": 2.5892857142857144, + "grad_norm": 0.8684949278831482, + "learning_rate": 3.7566196227960393e-05, + "loss": 0.0174, + "step": 1015 + }, + { + "epoch": 2.5918367346938775, + "grad_norm": 1.2504810094833374, + "learning_rate": 3.751521778730353e-05, + "loss": 0.057, + "step": 1016 + }, + { + "epoch": 2.5943877551020407, + "grad_norm": 0.7756729125976562, + "learning_rate": 3.7464527635968065e-05, + "loss": 0.0163, + "step": 1017 + }, + { + "epoch": 2.5969387755102042, + "grad_norm": 0.7388819456100464, + "learning_rate": 3.741412627524741e-05, + "loss": 0.0316, + "step": 1018 + }, + { + "epoch": 2.5994897959183674, + "grad_norm": 0.6562634706497192, + "learning_rate": 3.736401420357903e-05, + "loss": 0.0175, + "step": 1019 + }, + { + "epoch": 2.6020408163265305, + "grad_norm": 1.5008758306503296, + "learning_rate": 
3.7314191916539513e-05, + "loss": 0.0565, + "step": 1020 + }, + { + "epoch": 2.604591836734694, + "grad_norm": 0.5674561858177185, + "learning_rate": 3.726465990683962e-05, + "loss": 0.0161, + "step": 1021 + }, + { + "epoch": 2.607142857142857, + "grad_norm": 0.6072304248809814, + "learning_rate": 3.721541866431949e-05, + "loss": 0.0153, + "step": 1022 + }, + { + "epoch": 2.6096938775510203, + "grad_norm": 1.143333911895752, + "learning_rate": 3.7166468675943754e-05, + "loss": 0.0294, + "step": 1023 + }, + { + "epoch": 2.612244897959184, + "grad_norm": 0.9003584980964661, + "learning_rate": 3.7117810425796706e-05, + "loss": 0.031, + "step": 1024 + }, + { + "epoch": 2.614795918367347, + "grad_norm": 1.1214017868041992, + "learning_rate": 3.706944439507757e-05, + "loss": 0.0408, + "step": 1025 + }, + { + "epoch": 2.61734693877551, + "grad_norm": 1.043317198753357, + "learning_rate": 3.702137106209566e-05, + "loss": 0.0184, + "step": 1026 + }, + { + "epoch": 2.6198979591836737, + "grad_norm": 0.8998955488204956, + "learning_rate": 3.697359090226572e-05, + "loss": 0.0226, + "step": 1027 + }, + { + "epoch": 2.622448979591837, + "grad_norm": 0.7946634292602539, + "learning_rate": 3.692610438810321e-05, + "loss": 0.0192, + "step": 1028 + }, + { + "epoch": 2.625, + "grad_norm": 1.621121883392334, + "learning_rate": 3.687891198921959e-05, + "loss": 0.0667, + "step": 1029 + }, + { + "epoch": 2.627551020408163, + "grad_norm": 1.1459928750991821, + "learning_rate": 3.6832014172317734e-05, + "loss": 0.0467, + "step": 1030 + }, + { + "epoch": 2.6301020408163263, + "grad_norm": 1.4894814491271973, + "learning_rate": 3.6785411401187274e-05, + "loss": 0.0331, + "step": 1031 + }, + { + "epoch": 2.63265306122449, + "grad_norm": 1.24048912525177, + "learning_rate": 3.673910413670003e-05, + "loss": 0.0459, + "step": 1032 + }, + { + "epoch": 2.635204081632653, + "grad_norm": 0.44730693101882935, + "learning_rate": 3.669309283680544e-05, + "loss": 0.0123, + "step": 1033 + }, + { + "epoch": 2.637755102040816, + "grad_norm": 1.074154019355774, + "learning_rate": 3.664737795652605e-05, + "loss": 0.0541, + "step": 1034 + }, + { + "epoch": 2.6403061224489797, + "grad_norm": 1.7900382280349731, + "learning_rate": 3.660195994795302e-05, + "loss": 0.1093, + "step": 1035 + }, + { + "epoch": 2.642857142857143, + "grad_norm": 1.2100892066955566, + "learning_rate": 3.655683926024159e-05, + "loss": 0.0494, + "step": 1036 + }, + { + "epoch": 2.645408163265306, + "grad_norm": 1.4669338464736938, + "learning_rate": 3.651201633960673e-05, + "loss": 0.0495, + "step": 1037 + }, + { + "epoch": 2.6479591836734695, + "grad_norm": 0.7083831429481506, + "learning_rate": 3.646749162931867e-05, + "loss": 0.025, + "step": 1038 + }, + { + "epoch": 2.6505102040816326, + "grad_norm": 0.58478182554245, + "learning_rate": 3.6423265569698516e-05, + "loss": 0.0098, + "step": 1039 + }, + { + "epoch": 2.6530612244897958, + "grad_norm": 1.3051248788833618, + "learning_rate": 3.6379338598113935e-05, + "loss": 0.0376, + "step": 1040 + }, + { + "epoch": 2.6556122448979593, + "grad_norm": 1.0198227167129517, + "learning_rate": 3.633571114897478e-05, + "loss": 0.053, + "step": 1041 + }, + { + "epoch": 2.6581632653061225, + "grad_norm": 1.0635125637054443, + "learning_rate": 3.6292383653728806e-05, + "loss": 0.0332, + "step": 1042 + }, + { + "epoch": 2.6607142857142856, + "grad_norm": 0.39310547709465027, + "learning_rate": 3.624935654085746e-05, + "loss": 0.0272, + "step": 1043 + }, + { + "epoch": 2.663265306122449, + "grad_norm": 1.268485426902771, + 
"learning_rate": 3.6206630235871566e-05, + "loss": 0.0708, + "step": 1044 + }, + { + "epoch": 2.6658163265306123, + "grad_norm": 0.4056549072265625, + "learning_rate": 3.616420516130713e-05, + "loss": 0.01, + "step": 1045 + }, + { + "epoch": 2.6683673469387754, + "grad_norm": 0.678384006023407, + "learning_rate": 3.612208173672124e-05, + "loss": 0.0141, + "step": 1046 + }, + { + "epoch": 2.670918367346939, + "grad_norm": 1.2657197713851929, + "learning_rate": 3.608026037868778e-05, + "loss": 0.046, + "step": 1047 + }, + { + "epoch": 2.673469387755102, + "grad_norm": 0.5528343319892883, + "learning_rate": 3.603874150079346e-05, + "loss": 0.0093, + "step": 1048 + }, + { + "epoch": 2.6760204081632653, + "grad_norm": 0.851707398891449, + "learning_rate": 3.599752551363362e-05, + "loss": 0.0335, + "step": 1049 + }, + { + "epoch": 2.678571428571429, + "grad_norm": 0.1154296025633812, + "learning_rate": 3.59566128248082e-05, + "loss": 0.0021, + "step": 1050 + }, + { + "epoch": 2.681122448979592, + "grad_norm": 0.6295617818832397, + "learning_rate": 3.591600383891771e-05, + "loss": 0.0201, + "step": 1051 + }, + { + "epoch": 2.683673469387755, + "grad_norm": 1.1446092128753662, + "learning_rate": 3.587569895755925e-05, + "loss": 0.0459, + "step": 1052 + }, + { + "epoch": 2.686224489795918, + "grad_norm": 1.0561741590499878, + "learning_rate": 3.583569857932249e-05, + "loss": 0.0247, + "step": 1053 + }, + { + "epoch": 2.688775510204082, + "grad_norm": 1.1544740200042725, + "learning_rate": 3.57960030997858e-05, + "loss": 0.0535, + "step": 1054 + }, + { + "epoch": 2.691326530612245, + "grad_norm": 1.4142874479293823, + "learning_rate": 3.5756612911512255e-05, + "loss": 0.0482, + "step": 1055 + }, + { + "epoch": 2.693877551020408, + "grad_norm": 0.8935024738311768, + "learning_rate": 3.571752840404582e-05, + "loss": 0.0147, + "step": 1056 + }, + { + "epoch": 2.696428571428571, + "grad_norm": 1.8578927516937256, + "learning_rate": 3.567874996390743e-05, + "loss": 0.1053, + "step": 1057 + }, + { + "epoch": 2.6989795918367347, + "grad_norm": 0.5544687509536743, + "learning_rate": 3.5640277974591266e-05, + "loss": 0.0165, + "step": 1058 + }, + { + "epoch": 2.701530612244898, + "grad_norm": 0.6194223761558533, + "learning_rate": 3.560211281656087e-05, + "loss": 0.0115, + "step": 1059 + }, + { + "epoch": 2.704081632653061, + "grad_norm": 1.242207407951355, + "learning_rate": 3.5564254867245425e-05, + "loss": 0.0541, + "step": 1060 + }, + { + "epoch": 2.7066326530612246, + "grad_norm": 1.1520938873291016, + "learning_rate": 3.552670450103601e-05, + "loss": 0.0487, + "step": 1061 + }, + { + "epoch": 2.7091836734693877, + "grad_norm": 1.369409441947937, + "learning_rate": 3.54894620892819e-05, + "loss": 0.0567, + "step": 1062 + }, + { + "epoch": 2.7091836734693877, + "eval_NLI_loss": 0.7855402827262878, + "eval_NLI_runtime": 7.1525, + "eval_NLI_samples_per_second": 11.884, + "eval_NLI_steps_per_second": 0.14, + "eval_Qnli-dev_cosine_accuracy": 0.71875, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.6593053936958313, + "eval_Qnli-dev_cosine_ap": 0.7294178377677971, + "eval_Qnli-dev_cosine_f1": 0.7205882352941178, + "eval_Qnli-dev_cosine_f1_threshold": 0.5538477897644043, + "eval_Qnli-dev_cosine_mcc": 0.4324281836088688, + "eval_Qnli-dev_cosine_precision": 0.6363636363636364, + "eval_Qnli-dev_cosine_recall": 0.8305084745762712, + "eval_allNLI-dev_cosine_accuracy": 0.7734375, + "eval_allNLI-dev_cosine_accuracy_threshold": 0.6717847585678101, + "eval_allNLI-dev_cosine_ap": 0.6739622273633457, + 
"eval_allNLI-dev_cosine_f1": 0.7155963302752293, + "eval_allNLI-dev_cosine_f1_threshold": 0.6049352288246155, + "eval_allNLI-dev_cosine_mcc": 0.5569730681100454, + "eval_allNLI-dev_cosine_precision": 0.5909090909090909, + "eval_allNLI-dev_cosine_recall": 0.9069767441860465, + "eval_sequential_score": 0.7294178377677971, + "eval_sts-test_pearson_cosine": 0.9057715480270058, + "eval_sts-test_spearman_cosine": 0.9185964779742839, + "step": 1062 + }, + { + "epoch": 2.7091836734693877, + "eval_natural-questions_loss": 0.11109888553619385, + "eval_natural-questions_runtime": 42.1864, + "eval_natural-questions_samples_per_second": 2.679, + "eval_natural-questions_steps_per_second": 0.024, + "step": 1062 + }, + { + "epoch": 2.7091836734693877, + "eval_vitaminc_loss": 2.360363721847534, + "eval_vitaminc_runtime": 1.5176, + "eval_vitaminc_samples_per_second": 74.461, + "eval_vitaminc_steps_per_second": 0.659, + "step": 1062 + }, + { + "epoch": 2.7091836734693877, + "eval_xsum_loss": 0.10431700199842453, + "eval_xsum_runtime": 7.5878, + "eval_xsum_samples_per_second": 14.892, + "eval_xsum_steps_per_second": 0.132, + "step": 1062 + }, + { + "epoch": 2.7091836734693877, + "eval_paws_loss": 0.019417980685830116, + "eval_paws_runtime": 1.3712, + "eval_paws_samples_per_second": 82.411, + "eval_paws_steps_per_second": 0.729, + "step": 1062 + }, + { + "epoch": 2.7091836734693877, + "eval_global_dataset_loss": 0.2681998312473297, + "eval_global_dataset_runtime": 15.8425, + "eval_global_dataset_samples_per_second": 16.159, + "eval_global_dataset_steps_per_second": 0.063, + "step": 1062 + }, + { + "epoch": 2.711734693877551, + "grad_norm": 0.8661820292472839, + "learning_rate": 3.5452528000286924e-05, + "loss": 0.0173, + "step": 1063 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 3.1137821674346924, + "learning_rate": 3.541590259930575e-05, + "loss": 0.386, + "step": 1064 + }, + { + "epoch": 2.7168367346938775, + "grad_norm": 0.8698633909225464, + "learning_rate": 3.537958624854036e-05, + "loss": 0.0455, + "step": 1065 + }, + { + "epoch": 2.7193877551020407, + "grad_norm": 1.1439448595046997, + "learning_rate": 3.534357930713639e-05, + "loss": 0.0453, + "step": 1066 + }, + { + "epoch": 2.7219387755102042, + "grad_norm": 0.6917685866355896, + "learning_rate": 3.530788213117964e-05, + "loss": 0.0704, + "step": 1067 + }, + { + "epoch": 2.7244897959183674, + "grad_norm": 0.7231074571609497, + "learning_rate": 3.527249507369251e-05, + "loss": 0.0129, + "step": 1068 + }, + { + "epoch": 2.7270408163265305, + "grad_norm": 0.842807948589325, + "learning_rate": 3.523741848463052e-05, + "loss": 0.0238, + "step": 1069 + }, + { + "epoch": 2.729591836734694, + "grad_norm": 1.387162208557129, + "learning_rate": 3.52026527108789e-05, + "loss": 0.0703, + "step": 1070 + }, + { + "epoch": 2.732142857142857, + "grad_norm": 1.4876843690872192, + "learning_rate": 3.516819809624906e-05, + "loss": 0.0956, + "step": 1071 + }, + { + "epoch": 2.7346938775510203, + "grad_norm": 1.259988784790039, + "learning_rate": 3.513405498147525e-05, + "loss": 0.0322, + "step": 1072 + }, + { + "epoch": 2.737244897959184, + "grad_norm": 1.1273715496063232, + "learning_rate": 3.510022370421122e-05, + "loss": 0.0213, + "step": 1073 + }, + { + "epoch": 2.739795918367347, + "grad_norm": 0.5835068821907043, + "learning_rate": 3.506670459902682e-05, + "loss": 0.0077, + "step": 1074 + }, + { + "epoch": 2.74234693877551, + "grad_norm": 0.5347700119018555, + "learning_rate": 3.503349799740471e-05, + "loss": 0.0184, + "step": 1075 + }, + { + "epoch": 
2.7448979591836737, + "grad_norm": 1.1496635675430298, + "learning_rate": 3.50006042277371e-05, + "loss": 0.0324, + "step": 1076 + }, + { + "epoch": 2.747448979591837, + "grad_norm": 0.6822428107261658, + "learning_rate": 3.496802361532249e-05, + "loss": 0.022, + "step": 1077 + }, + { + "epoch": 2.75, + "grad_norm": 1.1158218383789062, + "learning_rate": 3.4935756482362446e-05, + "loss": 0.0265, + "step": 1078 + }, + { + "epoch": 2.752551020408163, + "grad_norm": 1.0793883800506592, + "learning_rate": 3.490380314795844e-05, + "loss": 0.0667, + "step": 1079 + }, + { + "epoch": 2.7551020408163263, + "grad_norm": 0.7501165270805359, + "learning_rate": 3.487216392810866e-05, + "loss": 0.0125, + "step": 1080 + }, + { + "epoch": 2.75765306122449, + "grad_norm": 0.936340868473053, + "learning_rate": 3.4840839135704883e-05, + "loss": 0.0455, + "step": 1081 + }, + { + "epoch": 2.760204081632653, + "grad_norm": 0.3183143734931946, + "learning_rate": 3.4809829080529434e-05, + "loss": 0.0046, + "step": 1082 + }, + { + "epoch": 2.762755102040816, + "grad_norm": 1.5530846118927002, + "learning_rate": 3.477913406925208e-05, + "loss": 0.0617, + "step": 1083 + }, + { + "epoch": 2.7653061224489797, + "grad_norm": 1.177962064743042, + "learning_rate": 3.474875440542697e-05, + "loss": 0.0301, + "step": 1084 + }, + { + "epoch": 2.767857142857143, + "grad_norm": 1.2689491510391235, + "learning_rate": 3.471869038948972e-05, + "loss": 0.0391, + "step": 1085 + }, + { + "epoch": 2.770408163265306, + "grad_norm": 1.937947392463684, + "learning_rate": 3.468894231875433e-05, + "loss": 0.0852, + "step": 1086 + }, + { + "epoch": 2.7729591836734695, + "grad_norm": 1.9010231494903564, + "learning_rate": 3.4659510487410354e-05, + "loss": 0.0808, + "step": 1087 + }, + { + "epoch": 2.7755102040816326, + "grad_norm": 1.1008816957473755, + "learning_rate": 3.463039518651991e-05, + "loss": 0.0322, + "step": 1088 + }, + { + "epoch": 2.7780612244897958, + "grad_norm": 0.5804948806762695, + "learning_rate": 3.460159670401484e-05, + "loss": 0.0151, + "step": 1089 + }, + { + "epoch": 2.7806122448979593, + "grad_norm": 1.2755776643753052, + "learning_rate": 3.4573115324693855e-05, + "loss": 0.0493, + "step": 1090 + }, + { + "epoch": 2.7831632653061225, + "grad_norm": 1.2872555255889893, + "learning_rate": 3.454495133021971e-05, + "loss": 0.0409, + "step": 1091 + }, + { + "epoch": 2.7857142857142856, + "grad_norm": 1.603512167930603, + "learning_rate": 3.451710499911643e-05, + "loss": 0.0839, + "step": 1092 + }, + { + "epoch": 2.788265306122449, + "grad_norm": 1.4260185956954956, + "learning_rate": 3.448957660676655e-05, + "loss": 0.0813, + "step": 1093 + }, + { + "epoch": 2.7908163265306123, + "grad_norm": 1.0357199907302856, + "learning_rate": 3.4462366425408404e-05, + "loss": 0.0592, + "step": 1094 + }, + { + "epoch": 2.7933673469387754, + "grad_norm": 0.8765033483505249, + "learning_rate": 3.4435474724133394e-05, + "loss": 0.0299, + "step": 1095 + }, + { + "epoch": 2.795918367346939, + "grad_norm": 0.40279337763786316, + "learning_rate": 3.4408901768883375e-05, + "loss": 0.0131, + "step": 1096 + }, + { + "epoch": 2.798469387755102, + "grad_norm": 0.7553824186325073, + "learning_rate": 3.4382647822448006e-05, + "loss": 0.0302, + "step": 1097 + }, + { + "epoch": 2.8010204081632653, + "grad_norm": 0.8826936483383179, + "learning_rate": 3.435671314446215e-05, + "loss": 0.0203, + "step": 1098 + }, + { + "epoch": 2.803571428571429, + "grad_norm": 0.49939998984336853, + "learning_rate": 3.43310979914033e-05, + "loss": 0.0117, + "step": 
1099 + }, + { + "epoch": 2.806122448979592, + "grad_norm": 1.030312180519104, + "learning_rate": 3.430580261658908e-05, + "loss": 0.0427, + "step": 1100 + }, + { + "epoch": 2.808673469387755, + "grad_norm": 1.4824496507644653, + "learning_rate": 3.4280827270174656e-05, + "loss": 0.0816, + "step": 1101 + }, + { + "epoch": 2.811224489795918, + "grad_norm": 1.1910951137542725, + "learning_rate": 3.4256172199150364e-05, + "loss": 0.05, + "step": 1102 + }, + { + "epoch": 2.813775510204082, + "grad_norm": 1.4083585739135742, + "learning_rate": 3.4231837647339205e-05, + "loss": 0.069, + "step": 1103 + }, + { + "epoch": 2.816326530612245, + "grad_norm": 1.5294753313064575, + "learning_rate": 3.420782385539444e-05, + "loss": 0.0815, + "step": 1104 + }, + { + "epoch": 2.818877551020408, + "grad_norm": 0.6797266602516174, + "learning_rate": 3.418413106079723e-05, + "loss": 0.0403, + "step": 1105 + }, + { + "epoch": 2.821428571428571, + "grad_norm": 0.9141923785209656, + "learning_rate": 3.416075949785426e-05, + "loss": 0.0384, + "step": 1106 + }, + { + "epoch": 2.8239795918367347, + "grad_norm": 0.879925549030304, + "learning_rate": 3.413770939769547e-05, + "loss": 0.0339, + "step": 1107 + }, + { + "epoch": 2.826530612244898, + "grad_norm": 1.281860589981079, + "learning_rate": 3.4114980988271686e-05, + "loss": 0.0588, + "step": 1108 + }, + { + "epoch": 2.829081632653061, + "grad_norm": 0.3762257397174835, + "learning_rate": 3.409257449435248e-05, + "loss": 0.0066, + "step": 1109 + }, + { + "epoch": 2.8316326530612246, + "grad_norm": 0.9136333465576172, + "learning_rate": 3.4070490137523833e-05, + "loss": 0.0282, + "step": 1110 + }, + { + "epoch": 2.8341836734693877, + "grad_norm": 0.8563865423202515, + "learning_rate": 3.404872813618601e-05, + "loss": 0.0259, + "step": 1111 + }, + { + "epoch": 2.836734693877551, + "grad_norm": 1.9055695533752441, + "learning_rate": 3.40272887055514e-05, + "loss": 0.1058, + "step": 1112 + }, + { + "epoch": 2.8392857142857144, + "grad_norm": 0.5798735618591309, + "learning_rate": 3.400617205764236e-05, + "loss": 0.0195, + "step": 1113 + }, + { + "epoch": 2.8418367346938775, + "grad_norm": 0.5622300505638123, + "learning_rate": 3.398537840128911e-05, + "loss": 0.0261, + "step": 1114 + }, + { + "epoch": 2.8443877551020407, + "grad_norm": 1.8140298128128052, + "learning_rate": 3.3964907942127733e-05, + "loss": 0.0771, + "step": 1115 + }, + { + "epoch": 2.8469387755102042, + "grad_norm": 0.9060431718826294, + "learning_rate": 3.394476088259806e-05, + "loss": 0.0581, + "step": 1116 + }, + { + "epoch": 2.8494897959183674, + "grad_norm": 1.3845596313476562, + "learning_rate": 3.392493742194173e-05, + "loss": 0.0372, + "step": 1117 + }, + { + "epoch": 2.8520408163265305, + "grad_norm": 1.0695796012878418, + "learning_rate": 3.390543775620017e-05, + "loss": 0.0361, + "step": 1118 + }, + { + "epoch": 2.854591836734694, + "grad_norm": 0.8523769974708557, + "learning_rate": 3.388626207821271e-05, + "loss": 0.0232, + "step": 1119 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.7926890254020691, + "learning_rate": 3.386741057761462e-05, + "loss": 0.0169, + "step": 1120 + }, + { + "epoch": 2.8596938775510203, + "grad_norm": 0.9696307182312012, + "learning_rate": 3.384888344083527e-05, + "loss": 0.0344, + "step": 1121 + }, + { + "epoch": 2.8596938775510203, + "eval_NLI_loss": 0.7702016234397888, + "eval_NLI_runtime": 7.1442, + "eval_NLI_samples_per_second": 11.898, + "eval_NLI_steps_per_second": 0.14, + "eval_Qnli-dev_cosine_accuracy": 0.734375, + 
"eval_Qnli-dev_cosine_accuracy_threshold": 0.6594305038452148, + "eval_Qnli-dev_cosine_ap": 0.7355859794628049, + "eval_Qnli-dev_cosine_f1": 0.7205882352941178, + "eval_Qnli-dev_cosine_f1_threshold": 0.560278058052063, + "eval_Qnli-dev_cosine_mcc": 0.4324281836088688, + "eval_Qnli-dev_cosine_precision": 0.6363636363636364, + "eval_Qnli-dev_cosine_recall": 0.8305084745762712, + "eval_allNLI-dev_cosine_accuracy": 0.7734375, + "eval_allNLI-dev_cosine_accuracy_threshold": 0.6672780513763428, + "eval_allNLI-dev_cosine_ap": 0.6758526385311352, + "eval_allNLI-dev_cosine_f1": 0.7090909090909091, + "eval_allNLI-dev_cosine_f1_threshold": 0.6128238439559937, + "eval_allNLI-dev_cosine_mcc": 0.5461880874624497, + "eval_allNLI-dev_cosine_precision": 0.582089552238806, + "eval_allNLI-dev_cosine_recall": 0.9069767441860465, + "eval_sequential_score": 0.7355859794628049, + "eval_sts-test_pearson_cosine": 0.9066645184678404, + "eval_sts-test_spearman_cosine": 0.9181100317629596, + "step": 1121 + }, + { + "epoch": 2.8596938775510203, + "eval_natural-questions_loss": 0.11856827139854431, + "eval_natural-questions_runtime": 42.2059, + "eval_natural-questions_samples_per_second": 2.677, + "eval_natural-questions_steps_per_second": 0.024, + "step": 1121 + }, + { + "epoch": 2.8596938775510203, + "eval_vitaminc_loss": 2.3960492610931396, + "eval_vitaminc_runtime": 1.5115, + "eval_vitaminc_samples_per_second": 74.761, + "eval_vitaminc_steps_per_second": 0.662, + "step": 1121 + }, + { + "epoch": 2.8596938775510203, + "eval_xsum_loss": 0.09892440587282181, + "eval_xsum_runtime": 7.595, + "eval_xsum_samples_per_second": 14.878, + "eval_xsum_steps_per_second": 0.132, + "step": 1121 + }, + { + "epoch": 2.8596938775510203, + "eval_paws_loss": 0.019541580229997635, + "eval_paws_runtime": 1.373, + "eval_paws_samples_per_second": 82.301, + "eval_paws_steps_per_second": 0.728, + "step": 1121 + }, + { + "epoch": 2.8596938775510203, + "eval_global_dataset_loss": 0.2706809341907501, + "eval_global_dataset_runtime": 15.8493, + "eval_global_dataset_samples_per_second": 16.152, + "eval_global_dataset_steps_per_second": 0.063, + "step": 1121 + }, + { + "epoch": 2.862244897959184, + "grad_norm": 1.1326380968093872, + "learning_rate": 3.3830680851096286e-05, + "loss": 0.0285, + "step": 1122 + }, + { + "epoch": 2.864795918367347, + "grad_norm": 1.5277966260910034, + "learning_rate": 3.381280298840972e-05, + "loss": 0.07, + "step": 1123 + }, + { + "epoch": 2.86734693877551, + "grad_norm": 1.911505937576294, + "learning_rate": 3.3795250029576284e-05, + "loss": 0.0527, + "step": 1124 + }, + { + "epoch": 2.8698979591836737, + "grad_norm": 0.8949317336082458, + "learning_rate": 3.37780221481836e-05, + "loss": 0.0342, + "step": 1125 + }, + { + "epoch": 2.872448979591837, + "grad_norm": 0.43560948967933655, + "learning_rate": 3.3761119514604475e-05, + "loss": 0.0263, + "step": 1126 + }, + { + "epoch": 2.875, + "grad_norm": 2.584179162979126, + "learning_rate": 3.3744542295995226e-05, + "loss": 0.2125, + "step": 1127 + }, + { + "epoch": 2.877551020408163, + "grad_norm": 0.7703952789306641, + "learning_rate": 3.3728290656294016e-05, + "loss": 0.0312, + "step": 1128 + }, + { + "epoch": 2.8801020408163263, + "grad_norm": 0.6504500508308411, + "learning_rate": 3.3712364756219246e-05, + "loss": 0.0213, + "step": 1129 + }, + { + "epoch": 2.88265306122449, + "grad_norm": 1.1103063821792603, + "learning_rate": 3.369676475326796e-05, + "loss": 0.0524, + "step": 1130 + }, + { + "epoch": 2.885204081632653, + "grad_norm": 0.16738423705101013, + 
"learning_rate": 3.368149080171427e-05, + "loss": 0.0095, + "step": 1131 + }, + { + "epoch": 2.887755102040816, + "grad_norm": 1.3120450973510742, + "learning_rate": 3.366654305260787e-05, + "loss": 0.0402, + "step": 1132 + }, + { + "epoch": 2.8903061224489797, + "grad_norm": 1.2806401252746582, + "learning_rate": 3.3651921653772514e-05, + "loss": 0.0556, + "step": 1133 + }, + { + "epoch": 2.892857142857143, + "grad_norm": 0.49201884865760803, + "learning_rate": 3.363762674980454e-05, + "loss": 0.0355, + "step": 1134 + }, + { + "epoch": 2.895408163265306, + "grad_norm": 1.4078779220581055, + "learning_rate": 3.362365848207149e-05, + "loss": 0.0458, + "step": 1135 + }, + { + "epoch": 2.8979591836734695, + "grad_norm": 1.2205567359924316, + "learning_rate": 3.361001698871064e-05, + "loss": 0.0469, + "step": 1136 + }, + { + "epoch": 2.9005102040816326, + "grad_norm": 0.6651041507720947, + "learning_rate": 3.3596702404627715e-05, + "loss": 0.0389, + "step": 1137 + }, + { + "epoch": 2.9030612244897958, + "grad_norm": 0.9572398662567139, + "learning_rate": 3.35837148614955e-05, + "loss": 0.0627, + "step": 1138 + }, + { + "epoch": 2.9056122448979593, + "grad_norm": 1.1092970371246338, + "learning_rate": 3.357105448775254e-05, + "loss": 0.0535, + "step": 1139 + }, + { + "epoch": 2.9081632653061225, + "grad_norm": 1.3637953996658325, + "learning_rate": 3.35587214086019e-05, + "loss": 0.0671, + "step": 1140 + }, + { + "epoch": 2.9107142857142856, + "grad_norm": 0.7675915360450745, + "learning_rate": 3.3546715746009906e-05, + "loss": 0.0137, + "step": 1141 + }, + { + "epoch": 2.913265306122449, + "grad_norm": 0.9332019686698914, + "learning_rate": 3.353503761870492e-05, + "loss": 0.0502, + "step": 1142 + }, + { + "epoch": 2.9158163265306123, + "grad_norm": 1.0251212120056152, + "learning_rate": 3.352368714217624e-05, + "loss": 0.0259, + "step": 1143 + }, + { + "epoch": 2.9183673469387754, + "grad_norm": 0.3970426619052887, + "learning_rate": 3.3512664428672834e-05, + "loss": 0.0085, + "step": 1144 + }, + { + "epoch": 2.920918367346939, + "grad_norm": 1.292211651802063, + "learning_rate": 3.350196958720237e-05, + "loss": 0.0387, + "step": 1145 + }, + { + "epoch": 2.923469387755102, + "grad_norm": 1.2716881036758423, + "learning_rate": 3.349160272353003e-05, + "loss": 0.0438, + "step": 1146 + }, + { + "epoch": 2.9260204081632653, + "grad_norm": 0.36506468057632446, + "learning_rate": 3.3481563940177515e-05, + "loss": 0.0072, + "step": 1147 + }, + { + "epoch": 2.928571428571429, + "grad_norm": 0.9190146327018738, + "learning_rate": 3.347185333642201e-05, + "loss": 0.0209, + "step": 1148 + }, + { + "epoch": 2.931122448979592, + "grad_norm": 1.3610502481460571, + "learning_rate": 3.346247100829524e-05, + "loss": 0.0483, + "step": 1149 + }, + { + "epoch": 2.933673469387755, + "grad_norm": 1.1471080780029297, + "learning_rate": 3.345341704858245e-05, + "loss": 0.0628, + "step": 1150 + }, + { + "epoch": 2.936224489795918, + "grad_norm": 1.3273050785064697, + "learning_rate": 3.344469154682157e-05, + "loss": 0.0612, + "step": 1151 + }, + { + "epoch": 2.938775510204082, + "grad_norm": 1.2681299448013306, + "learning_rate": 3.343629458930226e-05, + "loss": 0.0419, + "step": 1152 + }, + { + "epoch": 2.941326530612245, + "grad_norm": 2.902991533279419, + "learning_rate": 3.342822625906512e-05, + "loss": 0.5139, + "step": 1153 + }, + { + "epoch": 2.943877551020408, + "grad_norm": 1.104650855064392, + "learning_rate": 3.3420486635900794e-05, + "loss": 0.0633, + "step": 1154 + }, + { + "epoch": 2.946428571428571, + 
"grad_norm": 0.5833232998847961, + "learning_rate": 3.3413075796349245e-05, + "loss": 0.0127, + "step": 1155 + }, + { + "epoch": 2.9489795918367347, + "grad_norm": 1.3277106285095215, + "learning_rate": 3.3405993813698974e-05, + "loss": 0.0822, + "step": 1156 + }, + { + "epoch": 2.951530612244898, + "grad_norm": 0.8056492209434509, + "learning_rate": 3.339924075798629e-05, + "loss": 0.0175, + "step": 1157 + }, + { + "epoch": 2.954081632653061, + "grad_norm": 0.21101132035255432, + "learning_rate": 3.339281669599464e-05, + "loss": 0.0047, + "step": 1158 + }, + { + "epoch": 2.9566326530612246, + "grad_norm": 1.1993032693862915, + "learning_rate": 3.33867216912539e-05, + "loss": 0.0447, + "step": 1159 + }, + { + "epoch": 2.9591836734693877, + "grad_norm": 1.1763912439346313, + "learning_rate": 3.3380955804039806e-05, + "loss": 0.0375, + "step": 1160 + }, + { + "epoch": 2.961734693877551, + "grad_norm": 1.007827639579773, + "learning_rate": 3.337551909137331e-05, + "loss": 0.0581, + "step": 1161 + }, + { + "epoch": 2.9642857142857144, + "grad_norm": 1.477920651435852, + "learning_rate": 3.337041160702007e-05, + "loss": 0.0733, + "step": 1162 + }, + { + "epoch": 2.9668367346938775, + "grad_norm": 0.9569999575614929, + "learning_rate": 3.336563340148985e-05, + "loss": 0.0211, + "step": 1163 + }, + { + "epoch": 2.9693877551020407, + "grad_norm": 1.7268577814102173, + "learning_rate": 3.336118452203607e-05, + "loss": 0.1113, + "step": 1164 + }, + { + "epoch": 2.9719387755102042, + "grad_norm": 1.1772488355636597, + "learning_rate": 3.335706501265533e-05, + "loss": 0.0554, + "step": 1165 + }, + { + "epoch": 2.9744897959183674, + "grad_norm": 1.3847665786743164, + "learning_rate": 3.3353274914086955e-05, + "loss": 0.0672, + "step": 1166 + }, + { + "epoch": 2.9770408163265305, + "grad_norm": 0.7754787802696228, + "learning_rate": 3.3349814263812615e-05, + "loss": 0.0374, + "step": 1167 + } + ], + "logging_steps": 1, + "max_steps": 1176, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 389, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 192, + "trial_name": null, + "trial_params": null +}