{ "best_metric": 0.8714768883878241, "best_model_checkpoint": "dinov2-small-imagenet1k-1-layer-finetuned-galaxy10-decals/checkpoint-1996", "epoch": 19.879759519038075, "eval_steps": 500, "global_step": 2480, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08016032064128256, "grad_norm": 59.08690643310547, "learning_rate": 2.0161290322580646e-06, "loss": 2.8233, "step": 10 }, { "epoch": 0.16032064128256512, "grad_norm": 53.841739654541016, "learning_rate": 4.032258064516129e-06, "loss": 2.1402, "step": 20 }, { "epoch": 0.24048096192384769, "grad_norm": 64.50281524658203, "learning_rate": 6.048387096774194e-06, "loss": 1.8321, "step": 30 }, { "epoch": 0.32064128256513025, "grad_norm": 122.04541015625, "learning_rate": 8.064516129032258e-06, "loss": 1.5098, "step": 40 }, { "epoch": 0.40080160320641284, "grad_norm": 96.4590835571289, "learning_rate": 1.0080645161290323e-05, "loss": 1.3558, "step": 50 }, { "epoch": 0.48096192384769537, "grad_norm": 77.96562194824219, "learning_rate": 1.2096774193548388e-05, "loss": 1.2094, "step": 60 }, { "epoch": 0.561122244488978, "grad_norm": 84.99612426757812, "learning_rate": 1.4112903225806454e-05, "loss": 1.1468, "step": 70 }, { "epoch": 0.6412825651302605, "grad_norm": 57.763580322265625, "learning_rate": 1.6129032258064517e-05, "loss": 1.1189, "step": 80 }, { "epoch": 0.7214428857715431, "grad_norm": 70.59989166259766, "learning_rate": 1.8145161290322583e-05, "loss": 1.0405, "step": 90 }, { "epoch": 0.8016032064128257, "grad_norm": 91.77726745605469, "learning_rate": 2.0161290322580645e-05, "loss": 1.0129, "step": 100 }, { "epoch": 0.8817635270541082, "grad_norm": 82.61339569091797, "learning_rate": 2.217741935483871e-05, "loss": 0.9405, "step": 110 }, { "epoch": 0.9619238476953907, "grad_norm": 74.24462127685547, "learning_rate": 2.4193548387096777e-05, "loss": 0.924, "step": 120 }, { "epoch": 0.9939879759519038, "eval_accuracy": 0.7294250281848929, "eval_loss": 0.8074643015861511, "eval_runtime": 16.5344, "eval_samples_per_second": 107.292, "eval_steps_per_second": 3.387, "step": 124 }, { "epoch": 1.0420841683366733, "grad_norm": 140.59048461914062, "learning_rate": 2.620967741935484e-05, "loss": 0.8607, "step": 130 }, { "epoch": 1.122244488977956, "grad_norm": 190.5211181640625, "learning_rate": 2.822580645161291e-05, "loss": 0.9175, "step": 140 }, { "epoch": 1.2024048096192386, "grad_norm": 83.77838897705078, "learning_rate": 3.024193548387097e-05, "loss": 0.8886, "step": 150 }, { "epoch": 1.282565130260521, "grad_norm": 68.58927917480469, "learning_rate": 3.2258064516129034e-05, "loss": 0.9134, "step": 160 }, { "epoch": 1.3627254509018036, "grad_norm": 100.79639434814453, "learning_rate": 3.427419354838709e-05, "loss": 0.8889, "step": 170 }, { "epoch": 1.4428857715430863, "grad_norm": 94.05249786376953, "learning_rate": 3.6290322580645165e-05, "loss": 0.9035, "step": 180 }, { "epoch": 1.5230460921843687, "grad_norm": 61.07211685180664, "learning_rate": 3.8306451612903224e-05, "loss": 0.9032, "step": 190 }, { "epoch": 1.6032064128256514, "grad_norm": 46.34950256347656, "learning_rate": 4.032258064516129e-05, "loss": 0.9545, "step": 200 }, { "epoch": 1.6833667334669338, "grad_norm": 59.57305908203125, "learning_rate": 4.2338709677419356e-05, "loss": 0.9195, "step": 210 }, { "epoch": 1.7635270541082164, "grad_norm": 51.62540054321289, "learning_rate": 4.435483870967742e-05, "loss": 0.9062, "step": 220 }, { "epoch": 1.843687374749499, "grad_norm": 61.9476318359375, "learning_rate": 4.637096774193548e-05, "loss": 0.8817, "step": 230 }, { "epoch": 1.9238476953907817, "grad_norm": 59.98125457763672, "learning_rate": 4.8387096774193554e-05, "loss": 0.8226, "step": 240 }, { "epoch": 1.9959919839679359, "eval_accuracy": 0.7192784667418264, "eval_loss": 0.8194935321807861, "eval_runtime": 16.4183, "eval_samples_per_second": 108.05, "eval_steps_per_second": 3.411, "step": 249 }, { "epoch": 2.004008016032064, "grad_norm": 38.62278366088867, "learning_rate": 4.995519713261649e-05, "loss": 0.8421, "step": 250 }, { "epoch": 2.0841683366733466, "grad_norm": 42.719417572021484, "learning_rate": 4.973118279569893e-05, "loss": 0.8578, "step": 260 }, { "epoch": 2.164328657314629, "grad_norm": 53.102413177490234, "learning_rate": 4.950716845878137e-05, "loss": 0.9822, "step": 270 }, { "epoch": 2.244488977955912, "grad_norm": 63.29998779296875, "learning_rate": 4.92831541218638e-05, "loss": 0.9396, "step": 280 }, { "epoch": 2.3246492985971945, "grad_norm": 44.63651657104492, "learning_rate": 4.905913978494624e-05, "loss": 0.9457, "step": 290 }, { "epoch": 2.404809619238477, "grad_norm": 47.95414733886719, "learning_rate": 4.8835125448028677e-05, "loss": 0.8546, "step": 300 }, { "epoch": 2.4849699398797593, "grad_norm": 48.8958854675293, "learning_rate": 4.8611111111111115e-05, "loss": 0.9152, "step": 310 }, { "epoch": 2.565130260521042, "grad_norm": 92.62353515625, "learning_rate": 4.8387096774193554e-05, "loss": 0.8634, "step": 320 }, { "epoch": 2.6452905811623246, "grad_norm": 35.2147216796875, "learning_rate": 4.8163082437275986e-05, "loss": 0.7998, "step": 330 }, { "epoch": 2.7254509018036073, "grad_norm": 42.57783126831055, "learning_rate": 4.7939068100358424e-05, "loss": 0.7981, "step": 340 }, { "epoch": 2.80561122244489, "grad_norm": 35.527687072753906, "learning_rate": 4.771505376344086e-05, "loss": 0.8871, "step": 350 }, { "epoch": 2.8857715430861726, "grad_norm": 114.99385070800781, "learning_rate": 4.74910394265233e-05, "loss": 0.7689, "step": 360 }, { "epoch": 2.9659318637274548, "grad_norm": 84.57234954833984, "learning_rate": 4.726702508960574e-05, "loss": 0.805, "step": 370 }, { "epoch": 2.997995991983968, "eval_accuracy": 0.790304396843292, "eval_loss": 0.6233341693878174, "eval_runtime": 16.6664, "eval_samples_per_second": 106.442, "eval_steps_per_second": 3.36, "step": 374 }, { "epoch": 3.0460921843687374, "grad_norm": 36.23030090332031, "learning_rate": 4.704301075268818e-05, "loss": 0.7415, "step": 380 }, { "epoch": 3.12625250501002, "grad_norm": 69.19331359863281, "learning_rate": 4.681899641577061e-05, "loss": 0.7203, "step": 390 }, { "epoch": 3.2064128256513027, "grad_norm": 27.987106323242188, "learning_rate": 4.659498207885305e-05, "loss": 0.6812, "step": 400 }, { "epoch": 3.2865731462925853, "grad_norm": 46.88111877441406, "learning_rate": 4.637096774193548e-05, "loss": 0.7374, "step": 410 }, { "epoch": 3.3667334669338675, "grad_norm": 38.22345733642578, "learning_rate": 4.614695340501792e-05, "loss": 0.7994, "step": 420 }, { "epoch": 3.44689378757515, "grad_norm": 55.37908172607422, "learning_rate": 4.5922939068100365e-05, "loss": 0.7888, "step": 430 }, { "epoch": 3.527054108216433, "grad_norm": 36.60285186767578, "learning_rate": 4.56989247311828e-05, "loss": 0.7572, "step": 440 }, { "epoch": 3.6072144288577155, "grad_norm": 52.20883560180664, "learning_rate": 4.5474910394265236e-05, "loss": 0.778, "step": 450 }, { "epoch": 3.687374749498998, "grad_norm": 30.91020965576172, "learning_rate": 4.5250896057347674e-05, "loss": 0.7487, "step": 460 }, { "epoch": 3.7675350701402808, "grad_norm": 21.50102996826172, "learning_rate": 4.5026881720430106e-05, "loss": 0.7287, "step": 470 }, { "epoch": 3.847695390781563, "grad_norm": 28.19744110107422, "learning_rate": 4.4802867383512545e-05, "loss": 0.7762, "step": 480 }, { "epoch": 3.9278557114228456, "grad_norm": 28.148900985717773, "learning_rate": 4.4578853046594983e-05, "loss": 0.752, "step": 490 }, { "epoch": 4.0, "eval_accuracy": 0.7795941375422774, "eval_loss": 0.6686776280403137, "eval_runtime": 16.5892, "eval_samples_per_second": 106.937, "eval_steps_per_second": 3.376, "step": 499 }, { "epoch": 4.008016032064128, "grad_norm": 40.84296798706055, "learning_rate": 4.435483870967742e-05, "loss": 0.7898, "step": 500 }, { "epoch": 4.0881763527054105, "grad_norm": 32.189735412597656, "learning_rate": 4.413082437275986e-05, "loss": 0.7361, "step": 510 }, { "epoch": 4.168336673346693, "grad_norm": 27.465147018432617, "learning_rate": 4.390681003584229e-05, "loss": 0.7795, "step": 520 }, { "epoch": 4.248496993987976, "grad_norm": 27.916139602661133, "learning_rate": 4.368279569892473e-05, "loss": 0.7467, "step": 530 }, { "epoch": 4.328657314629258, "grad_norm": 32.374847412109375, "learning_rate": 4.345878136200717e-05, "loss": 0.6427, "step": 540 }, { "epoch": 4.408817635270541, "grad_norm": 45.016536712646484, "learning_rate": 4.323476702508961e-05, "loss": 0.6988, "step": 550 }, { "epoch": 4.488977955911824, "grad_norm": 36.51612091064453, "learning_rate": 4.301075268817205e-05, "loss": 0.6696, "step": 560 }, { "epoch": 4.569138276553106, "grad_norm": 26.923110961914062, "learning_rate": 4.2786738351254486e-05, "loss": 0.6512, "step": 570 }, { "epoch": 4.649298597194389, "grad_norm": 29.096832275390625, "learning_rate": 4.256272401433692e-05, "loss": 0.6775, "step": 580 }, { "epoch": 4.729458917835672, "grad_norm": 72.05559539794922, "learning_rate": 4.2338709677419356e-05, "loss": 0.6479, "step": 590 }, { "epoch": 4.809619238476954, "grad_norm": 31.95567512512207, "learning_rate": 4.2114695340501795e-05, "loss": 0.7208, "step": 600 }, { "epoch": 4.889779559118237, "grad_norm": 21.673049926757812, "learning_rate": 4.1890681003584233e-05, "loss": 0.6655, "step": 610 }, { "epoch": 4.969939879759519, "grad_norm": 33.81110382080078, "learning_rate": 4.166666666666667e-05, "loss": 0.6193, "step": 620 }, { "epoch": 4.993987975951904, "eval_accuracy": 0.8049605411499436, "eval_loss": 0.5707754492759705, "eval_runtime": 16.6845, "eval_samples_per_second": 106.326, "eval_steps_per_second": 3.356, "step": 623 }, { "epoch": 5.050100200400801, "grad_norm": 21.3759822845459, "learning_rate": 4.1442652329749104e-05, "loss": 0.6843, "step": 630 }, { "epoch": 5.130260521042084, "grad_norm": 29.779949188232422, "learning_rate": 4.121863799283154e-05, "loss": 0.6394, "step": 640 }, { "epoch": 5.210420841683367, "grad_norm": 28.196969985961914, "learning_rate": 4.099462365591398e-05, "loss": 0.6629, "step": 650 }, { "epoch": 5.290581162324649, "grad_norm": 36.27236557006836, "learning_rate": 4.077060931899642e-05, "loss": 0.6814, "step": 660 }, { "epoch": 5.370741482965932, "grad_norm": 26.93575668334961, "learning_rate": 4.054659498207886e-05, "loss": 0.7281, "step": 670 }, { "epoch": 5.4509018036072145, "grad_norm": 38.862640380859375, "learning_rate": 4.032258064516129e-05, "loss": 0.6512, "step": 680 }, { "epoch": 5.531062124248497, "grad_norm": 22.26070213317871, "learning_rate": 4.009856630824373e-05, "loss": 0.625, "step": 690 }, { "epoch": 5.61122244488978, "grad_norm": 35.700435638427734, "learning_rate": 3.987455197132617e-05, "loss": 0.7244, "step": 700 }, { "epoch": 5.6913827655310625, "grad_norm": 29.08810806274414, "learning_rate": 3.96505376344086e-05, "loss": 0.7093, "step": 710 }, { "epoch": 5.771543086172345, "grad_norm": 33.14811706542969, "learning_rate": 3.9426523297491045e-05, "loss": 0.6039, "step": 720 }, { "epoch": 5.851703406813627, "grad_norm": 36.37564468383789, "learning_rate": 3.9202508960573483e-05, "loss": 0.6434, "step": 730 }, { "epoch": 5.9318637274549095, "grad_norm": 25.49966812133789, "learning_rate": 3.8978494623655915e-05, "loss": 0.5822, "step": 740 }, { "epoch": 5.995991983967936, "eval_accuracy": 0.8060879368658399, "eval_loss": 0.5624008178710938, "eval_runtime": 16.5355, "eval_samples_per_second": 107.284, "eval_steps_per_second": 3.387, "step": 748 }, { "epoch": 6.012024048096192, "grad_norm": 28.702117919921875, "learning_rate": 3.8754480286738354e-05, "loss": 0.5943, "step": 750 }, { "epoch": 6.092184368737475, "grad_norm": 55.89380645751953, "learning_rate": 3.8530465949820786e-05, "loss": 0.6292, "step": 760 }, { "epoch": 6.1723446893787575, "grad_norm": 25.705108642578125, "learning_rate": 3.8306451612903224e-05, "loss": 0.6334, "step": 770 }, { "epoch": 6.25250501002004, "grad_norm": 62.475040435791016, "learning_rate": 3.808243727598566e-05, "loss": 0.6187, "step": 780 }, { "epoch": 6.332665330661323, "grad_norm": 29.610858917236328, "learning_rate": 3.78584229390681e-05, "loss": 0.6301, "step": 790 }, { "epoch": 6.412825651302605, "grad_norm": 29.61836051940918, "learning_rate": 3.763440860215054e-05, "loss": 0.5877, "step": 800 }, { "epoch": 6.492985971943888, "grad_norm": 25.111305236816406, "learning_rate": 3.741039426523298e-05, "loss": 0.5988, "step": 810 }, { "epoch": 6.573146292585171, "grad_norm": 31.161901473999023, "learning_rate": 3.718637992831541e-05, "loss": 0.6037, "step": 820 }, { "epoch": 6.653306613226453, "grad_norm": 28.766677856445312, "learning_rate": 3.696236559139785e-05, "loss": 0.6522, "step": 830 }, { "epoch": 6.733466933867735, "grad_norm": 25.192800521850586, "learning_rate": 3.673835125448029e-05, "loss": 0.6015, "step": 840 }, { "epoch": 6.813627254509018, "grad_norm": 32.32080841064453, "learning_rate": 3.651433691756273e-05, "loss": 0.6019, "step": 850 }, { "epoch": 6.8937875751503, "grad_norm": 52.31047439575195, "learning_rate": 3.6290322580645165e-05, "loss": 0.6806, "step": 860 }, { "epoch": 6.973947895791583, "grad_norm": 25.50705337524414, "learning_rate": 3.60663082437276e-05, "loss": 0.6227, "step": 870 }, { "epoch": 6.997995991983968, "eval_accuracy": 0.8134160090191658, "eval_loss": 0.5536333322525024, "eval_runtime": 16.6273, "eval_samples_per_second": 106.692, "eval_steps_per_second": 3.368, "step": 873 }, { "epoch": 7.054108216432866, "grad_norm": 19.30939292907715, "learning_rate": 3.5842293906810036e-05, "loss": 0.5791, "step": 880 }, { "epoch": 7.134268537074148, "grad_norm": 32.007442474365234, "learning_rate": 3.5618279569892474e-05, "loss": 0.6049, "step": 890 }, { "epoch": 7.214428857715431, "grad_norm": 22.92371940612793, "learning_rate": 3.539426523297491e-05, "loss": 0.6177, "step": 900 }, { "epoch": 7.294589178356714, "grad_norm": 42.33139419555664, "learning_rate": 3.517025089605735e-05, "loss": 0.5685, "step": 910 }, { "epoch": 7.374749498997996, "grad_norm": 23.081684112548828, "learning_rate": 3.494623655913979e-05, "loss": 0.5914, "step": 920 }, { "epoch": 7.454909819639279, "grad_norm": 28.351613998413086, "learning_rate": 3.472222222222222e-05, "loss": 0.6416, "step": 930 }, { "epoch": 7.5350701402805615, "grad_norm": 29.150787353515625, "learning_rate": 3.449820788530466e-05, "loss": 0.5786, "step": 940 }, { "epoch": 7.615230460921843, "grad_norm": 25.98550796508789, "learning_rate": 3.427419354838709e-05, "loss": 0.6198, "step": 950 }, { "epoch": 7.695390781563126, "grad_norm": 17.467369079589844, "learning_rate": 3.405017921146954e-05, "loss": 0.5378, "step": 960 }, { "epoch": 7.775551102204409, "grad_norm": 27.1838436126709, "learning_rate": 3.382616487455198e-05, "loss": 0.607, "step": 970 }, { "epoch": 7.855711422845691, "grad_norm": 38.05990219116211, "learning_rate": 3.360215053763441e-05, "loss": 0.5962, "step": 980 }, { "epoch": 7.935871743486974, "grad_norm": 42.47323989868164, "learning_rate": 3.337813620071685e-05, "loss": 0.6067, "step": 990 }, { "epoch": 8.0, "eval_accuracy": 0.818489289740699, "eval_loss": 0.5687136650085449, "eval_runtime": 16.5686, "eval_samples_per_second": 107.07, "eval_steps_per_second": 3.38, "step": 998 }, { "epoch": 8.016032064128256, "grad_norm": 37.08466339111328, "learning_rate": 3.3154121863799286e-05, "loss": 0.5576, "step": 1000 }, { "epoch": 8.09619238476954, "grad_norm": 23.092058181762695, "learning_rate": 3.293010752688172e-05, "loss": 0.5297, "step": 1010 }, { "epoch": 8.176352705410821, "grad_norm": 21.885507583618164, "learning_rate": 3.270609318996416e-05, "loss": 0.5678, "step": 1020 }, { "epoch": 8.256513026052104, "grad_norm": 29.588977813720703, "learning_rate": 3.24820788530466e-05, "loss": 0.5404, "step": 1030 }, { "epoch": 8.336673346693386, "grad_norm": 58.07012176513672, "learning_rate": 3.2258064516129034e-05, "loss": 0.535, "step": 1040 }, { "epoch": 8.41683366733467, "grad_norm": 25.207134246826172, "learning_rate": 3.203405017921147e-05, "loss": 0.5622, "step": 1050 }, { "epoch": 8.496993987975952, "grad_norm": 27.917211532592773, "learning_rate": 3.1810035842293904e-05, "loss": 0.5527, "step": 1060 }, { "epoch": 8.577154308617235, "grad_norm": 17.901611328125, "learning_rate": 3.158602150537634e-05, "loss": 0.5115, "step": 1070 }, { "epoch": 8.657314629258517, "grad_norm": 20.38230323791504, "learning_rate": 3.136200716845878e-05, "loss": 0.608, "step": 1080 }, { "epoch": 8.7374749498998, "grad_norm": 22.982620239257812, "learning_rate": 3.113799283154122e-05, "loss": 0.5628, "step": 1090 }, { "epoch": 8.817635270541082, "grad_norm": 22.212993621826172, "learning_rate": 3.091397849462366e-05, "loss": 0.5076, "step": 1100 }, { "epoch": 8.897795591182366, "grad_norm": 28.16684913635254, "learning_rate": 3.06899641577061e-05, "loss": 0.5299, "step": 1110 }, { "epoch": 8.977955911823647, "grad_norm": 19.41158676147461, "learning_rate": 3.046594982078853e-05, "loss": 0.533, "step": 1120 }, { "epoch": 8.993987975951903, "eval_accuracy": 0.8089064261555806, "eval_loss": 0.5918950438499451, "eval_runtime": 16.6888, "eval_samples_per_second": 106.299, "eval_steps_per_second": 3.356, "step": 1122 }, { "epoch": 9.05811623246493, "grad_norm": 25.26761245727539, "learning_rate": 3.024193548387097e-05, "loss": 0.6276, "step": 1130 }, { "epoch": 9.138276553106213, "grad_norm": 18.059947967529297, "learning_rate": 3.0017921146953403e-05, "loss": 0.5674, "step": 1140 }, { "epoch": 9.218436873747494, "grad_norm": 37.593204498291016, "learning_rate": 2.979390681003584e-05, "loss": 0.5189, "step": 1150 }, { "epoch": 9.298597194388778, "grad_norm": 24.151836395263672, "learning_rate": 2.9569892473118284e-05, "loss": 0.5086, "step": 1160 }, { "epoch": 9.37875751503006, "grad_norm": 30.97344970703125, "learning_rate": 2.9345878136200715e-05, "loss": 0.5137, "step": 1170 }, { "epoch": 9.458917835671343, "grad_norm": 20.18236541748047, "learning_rate": 2.9121863799283154e-05, "loss": 0.4647, "step": 1180 }, { "epoch": 9.539078156312625, "grad_norm": 23.9926815032959, "learning_rate": 2.8897849462365596e-05, "loss": 0.5529, "step": 1190 }, { "epoch": 9.619238476953909, "grad_norm": 31.676265716552734, "learning_rate": 2.8673835125448028e-05, "loss": 0.5256, "step": 1200 }, { "epoch": 9.69939879759519, "grad_norm": 27.958820343017578, "learning_rate": 2.8449820788530467e-05, "loss": 0.5121, "step": 1210 }, { "epoch": 9.779559118236474, "grad_norm": 21.766021728515625, "learning_rate": 2.822580645161291e-05, "loss": 0.5393, "step": 1220 }, { "epoch": 9.859719438877756, "grad_norm": 24.838279724121094, "learning_rate": 2.800179211469534e-05, "loss": 0.478, "step": 1230 }, { "epoch": 9.939879759519037, "grad_norm": 36.48131561279297, "learning_rate": 2.777777777777778e-05, "loss": 0.5423, "step": 1240 }, { "epoch": 9.995991983967937, "eval_accuracy": 0.8370913190529876, "eval_loss": 0.502169668674469, "eval_runtime": 16.5887, "eval_samples_per_second": 106.94, "eval_steps_per_second": 3.376, "step": 1247 }, { "epoch": 10.02004008016032, "grad_norm": 19.24289894104004, "learning_rate": 2.7553763440860214e-05, "loss": 0.5308, "step": 1250 }, { "epoch": 10.100200400801603, "grad_norm": 19.2097110748291, "learning_rate": 2.7329749103942653e-05, "loss": 0.4834, "step": 1260 }, { "epoch": 10.180360721442886, "grad_norm": 26.702028274536133, "learning_rate": 2.710573476702509e-05, "loss": 0.4799, "step": 1270 }, { "epoch": 10.260521042084168, "grad_norm": 22.088153839111328, "learning_rate": 2.6881720430107527e-05, "loss": 0.5015, "step": 1280 }, { "epoch": 10.340681362725451, "grad_norm": 24.59678840637207, "learning_rate": 2.6657706093189965e-05, "loss": 0.5561, "step": 1290 }, { "epoch": 10.420841683366733, "grad_norm": 26.353622436523438, "learning_rate": 2.6433691756272404e-05, "loss": 0.479, "step": 1300 }, { "epoch": 10.501002004008017, "grad_norm": 21.673465728759766, "learning_rate": 2.620967741935484e-05, "loss": 0.5283, "step": 1310 }, { "epoch": 10.581162324649299, "grad_norm": 20.873695373535156, "learning_rate": 2.5985663082437278e-05, "loss": 0.4894, "step": 1320 }, { "epoch": 10.661322645290582, "grad_norm": 31.117862701416016, "learning_rate": 2.5761648745519713e-05, "loss": 0.4783, "step": 1330 }, { "epoch": 10.741482965931864, "grad_norm": 25.72610855102539, "learning_rate": 2.5537634408602152e-05, "loss": 0.4997, "step": 1340 }, { "epoch": 10.821643286573146, "grad_norm": 22.59943389892578, "learning_rate": 2.531362007168459e-05, "loss": 0.4518, "step": 1350 }, { "epoch": 10.901803607214429, "grad_norm": 17.32233428955078, "learning_rate": 2.5089605734767026e-05, "loss": 0.5313, "step": 1360 }, { "epoch": 10.98196392785571, "grad_norm": 20.552576065063477, "learning_rate": 2.4865591397849464e-05, "loss": 0.4747, "step": 1370 }, { "epoch": 10.997995991983968, "eval_accuracy": 0.8579481397970687, "eval_loss": 0.4419150948524475, "eval_runtime": 16.5822, "eval_samples_per_second": 106.982, "eval_steps_per_second": 3.377, "step": 1372 }, { "epoch": 11.062124248496994, "grad_norm": 20.17671775817871, "learning_rate": 2.46415770609319e-05, "loss": 0.4848, "step": 1380 }, { "epoch": 11.142284569138276, "grad_norm": 21.159685134887695, "learning_rate": 2.4417562724014338e-05, "loss": 0.4434, "step": 1390 }, { "epoch": 11.22244488977956, "grad_norm": 21.114795684814453, "learning_rate": 2.4193548387096777e-05, "loss": 0.4665, "step": 1400 }, { "epoch": 11.302605210420841, "grad_norm": 16.447608947753906, "learning_rate": 2.3969534050179212e-05, "loss": 0.45, "step": 1410 }, { "epoch": 11.382765531062125, "grad_norm": 34.57737731933594, "learning_rate": 2.374551971326165e-05, "loss": 0.5073, "step": 1420 }, { "epoch": 11.462925851703407, "grad_norm": 22.9782657623291, "learning_rate": 2.352150537634409e-05, "loss": 0.5099, "step": 1430 }, { "epoch": 11.54308617234469, "grad_norm": 30.651418685913086, "learning_rate": 2.3297491039426525e-05, "loss": 0.473, "step": 1440 }, { "epoch": 11.623246492985972, "grad_norm": 21.65363883972168, "learning_rate": 2.307347670250896e-05, "loss": 0.4723, "step": 1450 }, { "epoch": 11.703406813627254, "grad_norm": 20.955120086669922, "learning_rate": 2.28494623655914e-05, "loss": 0.4613, "step": 1460 }, { "epoch": 11.783567134268537, "grad_norm": 21.71851921081543, "learning_rate": 2.2625448028673837e-05, "loss": 0.4884, "step": 1470 }, { "epoch": 11.863727454909819, "grad_norm": 29.572038650512695, "learning_rate": 2.2401433691756272e-05, "loss": 0.4842, "step": 1480 }, { "epoch": 11.943887775551103, "grad_norm": 30.581287384033203, "learning_rate": 2.217741935483871e-05, "loss": 0.4367, "step": 1490 }, { "epoch": 12.0, "eval_accuracy": 0.8207440811724915, "eval_loss": 0.5084114074707031, "eval_runtime": 16.5802, "eval_samples_per_second": 106.995, "eval_steps_per_second": 3.378, "step": 1497 }, { "epoch": 12.024048096192384, "grad_norm": 21.091793060302734, "learning_rate": 2.1953405017921146e-05, "loss": 0.4565, "step": 1500 }, { "epoch": 12.104208416833668, "grad_norm": 16.97905921936035, "learning_rate": 2.1729390681003585e-05, "loss": 0.4488, "step": 1510 }, { "epoch": 12.18436873747495, "grad_norm": 20.028785705566406, "learning_rate": 2.1505376344086024e-05, "loss": 0.4109, "step": 1520 }, { "epoch": 12.264529058116233, "grad_norm": 26.391056060791016, "learning_rate": 2.128136200716846e-05, "loss": 0.4572, "step": 1530 }, { "epoch": 12.344689378757515, "grad_norm": 24.771198272705078, "learning_rate": 2.1057347670250897e-05, "loss": 0.4693, "step": 1540 }, { "epoch": 12.424849699398798, "grad_norm": 21.671600341796875, "learning_rate": 2.0833333333333336e-05, "loss": 0.4446, "step": 1550 }, { "epoch": 12.50501002004008, "grad_norm": 29.258426666259766, "learning_rate": 2.060931899641577e-05, "loss": 0.4272, "step": 1560 }, { "epoch": 12.585170340681362, "grad_norm": 22.518234252929688, "learning_rate": 2.038530465949821e-05, "loss": 0.4732, "step": 1570 }, { "epoch": 12.665330661322646, "grad_norm": 22.70555877685547, "learning_rate": 2.0161290322580645e-05, "loss": 0.4416, "step": 1580 }, { "epoch": 12.745490981963927, "grad_norm": 19.604206085205078, "learning_rate": 1.9937275985663084e-05, "loss": 0.4517, "step": 1590 }, { "epoch": 12.82565130260521, "grad_norm": 21.88922119140625, "learning_rate": 1.9713261648745522e-05, "loss": 0.4458, "step": 1600 }, { "epoch": 12.905811623246493, "grad_norm": 23.284523010253906, "learning_rate": 1.9489247311827958e-05, "loss": 0.4362, "step": 1610 }, { "epoch": 12.985971943887776, "grad_norm": 16.729040145874023, "learning_rate": 1.9265232974910393e-05, "loss": 0.4907, "step": 1620 }, { "epoch": 12.993987975951903, "eval_accuracy": 0.8365276211950394, "eval_loss": 0.4773949980735779, "eval_runtime": 16.6122, "eval_samples_per_second": 106.789, "eval_steps_per_second": 3.371, "step": 1621 }, { "epoch": 13.066132264529058, "grad_norm": 20.915903091430664, "learning_rate": 1.904121863799283e-05, "loss": 0.4276, "step": 1630 }, { "epoch": 13.146292585170341, "grad_norm": 22.99505615234375, "learning_rate": 1.881720430107527e-05, "loss": 0.4026, "step": 1640 }, { "epoch": 13.226452905811623, "grad_norm": 37.65668487548828, "learning_rate": 1.8593189964157705e-05, "loss": 0.4015, "step": 1650 }, { "epoch": 13.306613226452907, "grad_norm": 26.193628311157227, "learning_rate": 1.8369175627240144e-05, "loss": 0.4238, "step": 1660 }, { "epoch": 13.386773547094188, "grad_norm": 29.185932159423828, "learning_rate": 1.8145161290322583e-05, "loss": 0.4483, "step": 1670 }, { "epoch": 13.46693386773547, "grad_norm": 30.262685775756836, "learning_rate": 1.7921146953405018e-05, "loss": 0.3972, "step": 1680 }, { "epoch": 13.547094188376754, "grad_norm": 17.25420570373535, "learning_rate": 1.7697132616487457e-05, "loss": 0.416, "step": 1690 }, { "epoch": 13.627254509018035, "grad_norm": 15.789755821228027, "learning_rate": 1.7473118279569895e-05, "loss": 0.3985, "step": 1700 }, { "epoch": 13.707414829659319, "grad_norm": 24.821426391601562, "learning_rate": 1.724910394265233e-05, "loss": 0.4334, "step": 1710 }, { "epoch": 13.7875751503006, "grad_norm": 19.373735427856445, "learning_rate": 1.702508960573477e-05, "loss": 0.4494, "step": 1720 }, { "epoch": 13.867735470941884, "grad_norm": 39.24836730957031, "learning_rate": 1.6801075268817204e-05, "loss": 0.4272, "step": 1730 }, { "epoch": 13.947895791583166, "grad_norm": 18.48973846435547, "learning_rate": 1.6577060931899643e-05, "loss": 0.4269, "step": 1740 }, { "epoch": 13.995991983967937, "eval_accuracy": 0.8444193912063134, "eval_loss": 0.49454447627067566, "eval_runtime": 16.5837, "eval_samples_per_second": 106.972, "eval_steps_per_second": 3.377, "step": 1746 }, { "epoch": 14.02805611222445, "grad_norm": 19.31620979309082, "learning_rate": 1.635304659498208e-05, "loss": 0.4197, "step": 1750 }, { "epoch": 14.108216432865731, "grad_norm": 22.901765823364258, "learning_rate": 1.6129032258064517e-05, "loss": 0.4569, "step": 1760 }, { "epoch": 14.188376753507015, "grad_norm": 18.342025756835938, "learning_rate": 1.5905017921146952e-05, "loss": 0.3897, "step": 1770 }, { "epoch": 14.268537074148297, "grad_norm": 20.735149383544922, "learning_rate": 1.568100358422939e-05, "loss": 0.375, "step": 1780 }, { "epoch": 14.348697394789578, "grad_norm": 16.781925201416016, "learning_rate": 1.545698924731183e-05, "loss": 0.3994, "step": 1790 }, { "epoch": 14.428857715430862, "grad_norm": 20.588781356811523, "learning_rate": 1.5232974910394265e-05, "loss": 0.4204, "step": 1800 }, { "epoch": 14.509018036072144, "grad_norm": 23.370925903320312, "learning_rate": 1.5008960573476701e-05, "loss": 0.4208, "step": 1810 }, { "epoch": 14.589178356713427, "grad_norm": 30.92214584350586, "learning_rate": 1.4784946236559142e-05, "loss": 0.3962, "step": 1820 }, { "epoch": 14.669338677354709, "grad_norm": 18.780115127563477, "learning_rate": 1.4560931899641577e-05, "loss": 0.3895, "step": 1830 }, { "epoch": 14.749498997995993, "grad_norm": 23.375154495239258, "learning_rate": 1.4336917562724014e-05, "loss": 0.3871, "step": 1840 }, { "epoch": 14.829659318637274, "grad_norm": 18.720943450927734, "learning_rate": 1.4112903225806454e-05, "loss": 0.4037, "step": 1850 }, { "epoch": 14.909819639278558, "grad_norm": 27.814115524291992, "learning_rate": 1.388888888888889e-05, "loss": 0.409, "step": 1860 }, { "epoch": 14.98997995991984, "grad_norm": 23.809450149536133, "learning_rate": 1.3664874551971326e-05, "loss": 0.3787, "step": 1870 }, { "epoch": 14.997995991983968, "eval_accuracy": 0.8478015783540023, "eval_loss": 0.47794777154922485, "eval_runtime": 16.5967, "eval_samples_per_second": 106.889, "eval_steps_per_second": 3.374, "step": 1871 }, { "epoch": 15.070140280561123, "grad_norm": 18.869020462036133, "learning_rate": 1.3440860215053763e-05, "loss": 0.3546, "step": 1880 }, { "epoch": 15.150300601202405, "grad_norm": 18.53528594970703, "learning_rate": 1.3216845878136202e-05, "loss": 0.3785, "step": 1890 }, { "epoch": 15.230460921843687, "grad_norm": 25.095417022705078, "learning_rate": 1.2992831541218639e-05, "loss": 0.3921, "step": 1900 }, { "epoch": 15.31062124248497, "grad_norm": 19.071975708007812, "learning_rate": 1.2768817204301076e-05, "loss": 0.3924, "step": 1910 }, { "epoch": 15.390781563126252, "grad_norm": 20.396650314331055, "learning_rate": 1.2544802867383513e-05, "loss": 0.3877, "step": 1920 }, { "epoch": 15.470941883767535, "grad_norm": 26.20189094543457, "learning_rate": 1.232078853046595e-05, "loss": 0.3408, "step": 1930 }, { "epoch": 15.551102204408817, "grad_norm": 25.609718322753906, "learning_rate": 1.2096774193548388e-05, "loss": 0.3554, "step": 1940 }, { "epoch": 15.6312625250501, "grad_norm": 22.300622940063477, "learning_rate": 1.1872759856630825e-05, "loss": 0.3408, "step": 1950 }, { "epoch": 15.711422845691382, "grad_norm": 21.68485450744629, "learning_rate": 1.1648745519713262e-05, "loss": 0.3534, "step": 1960 }, { "epoch": 15.791583166332666, "grad_norm": 21.4897518157959, "learning_rate": 1.14247311827957e-05, "loss": 0.3685, "step": 1970 }, { "epoch": 15.871743486973948, "grad_norm": 23.654247283935547, "learning_rate": 1.1200716845878136e-05, "loss": 0.4038, "step": 1980 }, { "epoch": 15.951903807615231, "grad_norm": 24.013349533081055, "learning_rate": 1.0976702508960573e-05, "loss": 0.3724, "step": 1990 }, { "epoch": 16.0, "eval_accuracy": 0.8714768883878241, "eval_loss": 0.42415910959243774, "eval_runtime": 16.5547, "eval_samples_per_second": 107.16, "eval_steps_per_second": 3.383, "step": 1996 }, { "epoch": 16.03206412825651, "grad_norm": 19.16984748840332, "learning_rate": 1.0752688172043012e-05, "loss": 0.3318, "step": 2000 }, { "epoch": 16.112224448897795, "grad_norm": 25.895992279052734, "learning_rate": 1.0528673835125449e-05, "loss": 0.3325, "step": 2010 }, { "epoch": 16.19238476953908, "grad_norm": 22.940765380859375, "learning_rate": 1.0304659498207886e-05, "loss": 0.3384, "step": 2020 }, { "epoch": 16.272545090180362, "grad_norm": 16.38847541809082, "learning_rate": 1.0080645161290323e-05, "loss": 0.3289, "step": 2030 }, { "epoch": 16.352705410821642, "grad_norm": 15.670344352722168, "learning_rate": 9.856630824372761e-06, "loss": 0.3289, "step": 2040 }, { "epoch": 16.432865731462925, "grad_norm": 16.80422019958496, "learning_rate": 9.632616487455196e-06, "loss": 0.3387, "step": 2050 }, { "epoch": 16.51302605210421, "grad_norm": 17.764860153198242, "learning_rate": 9.408602150537635e-06, "loss": 0.3979, "step": 2060 }, { "epoch": 16.593186372745492, "grad_norm": 18.336956024169922, "learning_rate": 9.184587813620072e-06, "loss": 0.4002, "step": 2070 }, { "epoch": 16.673346693386772, "grad_norm": 16.667768478393555, "learning_rate": 8.960573476702509e-06, "loss": 0.3401, "step": 2080 }, { "epoch": 16.753507014028056, "grad_norm": 15.367273330688477, "learning_rate": 8.736559139784948e-06, "loss": 0.4036, "step": 2090 }, { "epoch": 16.83366733466934, "grad_norm": 22.505924224853516, "learning_rate": 8.512544802867385e-06, "loss": 0.3287, "step": 2100 }, { "epoch": 16.91382765531062, "grad_norm": 17.039518356323242, "learning_rate": 8.288530465949821e-06, "loss": 0.3105, "step": 2110 }, { "epoch": 16.993987975951903, "grad_norm": 24.149869918823242, "learning_rate": 8.064516129032258e-06, "loss": 0.354, "step": 2120 }, { "epoch": 16.993987975951903, "eval_accuracy": 0.8528748590755355, "eval_loss": 0.45945772528648376, "eval_runtime": 16.5552, "eval_samples_per_second": 107.156, "eval_steps_per_second": 3.383, "step": 2120 }, { "epoch": 17.074148296593187, "grad_norm": 23.050357818603516, "learning_rate": 7.840501792114695e-06, "loss": 0.3518, "step": 2130 }, { "epoch": 17.15430861723447, "grad_norm": 17.574710845947266, "learning_rate": 7.616487455197132e-06, "loss": 0.3024, "step": 2140 }, { "epoch": 17.23446893787575, "grad_norm": 27.150959014892578, "learning_rate": 7.392473118279571e-06, "loss": 0.3482, "step": 2150 }, { "epoch": 17.314629258517034, "grad_norm": 25.671140670776367, "learning_rate": 7.168458781362007e-06, "loss": 0.3461, "step": 2160 }, { "epoch": 17.394789579158317, "grad_norm": 23.23431968688965, "learning_rate": 6.944444444444445e-06, "loss": 0.3502, "step": 2170 }, { "epoch": 17.4749498997996, "grad_norm": 25.251195907592773, "learning_rate": 6.720430107526882e-06, "loss": 0.3505, "step": 2180 }, { "epoch": 17.55511022044088, "grad_norm": 18.16839027404785, "learning_rate": 6.4964157706093195e-06, "loss": 0.3401, "step": 2190 }, { "epoch": 17.635270541082164, "grad_norm": 17.334238052368164, "learning_rate": 6.2724014336917564e-06, "loss": 0.3182, "step": 2200 }, { "epoch": 17.715430861723448, "grad_norm": 23.427167892456055, "learning_rate": 6.048387096774194e-06, "loss": 0.2838, "step": 2210 }, { "epoch": 17.79559118236473, "grad_norm": 25.35896110534668, "learning_rate": 5.824372759856631e-06, "loss": 0.31, "step": 2220 }, { "epoch": 17.87575150300601, "grad_norm": 22.105119705200195, "learning_rate": 5.600358422939068e-06, "loss": 0.3231, "step": 2230 }, { "epoch": 17.955911823647295, "grad_norm": 33.91689682006836, "learning_rate": 5.376344086021506e-06, "loss": 0.3304, "step": 2240 }, { "epoch": 17.995991983967937, "eval_accuracy": 0.8562570462232244, "eval_loss": 0.45429307222366333, "eval_runtime": 17.1301, "eval_samples_per_second": 103.561, "eval_steps_per_second": 3.269, "step": 2245 }, { "epoch": 18.03607214428858, "grad_norm": 17.72748374938965, "learning_rate": 5.152329749103943e-06, "loss": 0.3273, "step": 2250 }, { "epoch": 18.11623246492986, "grad_norm": 15.051362991333008, "learning_rate": 4.928315412186381e-06, "loss": 0.2583, "step": 2260 }, { "epoch": 18.196392785571142, "grad_norm": 15.139473915100098, "learning_rate": 4.7043010752688175e-06, "loss": 0.2711, "step": 2270 }, { "epoch": 18.276553106212425, "grad_norm": 23.201719284057617, "learning_rate": 4.4802867383512545e-06, "loss": 0.3015, "step": 2280 }, { "epoch": 18.35671342685371, "grad_norm": 30.31717872619629, "learning_rate": 4.256272401433692e-06, "loss": 0.321, "step": 2290 }, { "epoch": 18.43687374749499, "grad_norm": 21.005664825439453, "learning_rate": 4.032258064516129e-06, "loss": 0.2887, "step": 2300 }, { "epoch": 18.517034068136272, "grad_norm": 14.847023963928223, "learning_rate": 3.808243727598566e-06, "loss": 0.2821, "step": 2310 }, { "epoch": 18.597194388777556, "grad_norm": 20.269981384277344, "learning_rate": 3.5842293906810035e-06, "loss": 0.2938, "step": 2320 }, { "epoch": 18.677354709418836, "grad_norm": 18.943777084350586, "learning_rate": 3.360215053763441e-06, "loss": 0.2936, "step": 2330 }, { "epoch": 18.75751503006012, "grad_norm": 17.58114242553711, "learning_rate": 3.1362007168458782e-06, "loss": 0.2801, "step": 2340 }, { "epoch": 18.837675350701403, "grad_norm": 18.223283767700195, "learning_rate": 2.9121863799283156e-06, "loss": 0.2792, "step": 2350 }, { "epoch": 18.917835671342687, "grad_norm": 19.910411834716797, "learning_rate": 2.688172043010753e-06, "loss": 0.3133, "step": 2360 }, { "epoch": 18.997995991983966, "grad_norm": 18.749902725219727, "learning_rate": 2.4641577060931903e-06, "loss": 0.2849, "step": 2370 }, { "epoch": 18.997995991983966, "eval_accuracy": 0.8607666290868095, "eval_loss": 0.4698711931705475, "eval_runtime": 16.5801, "eval_samples_per_second": 106.996, "eval_steps_per_second": 3.378, "step": 2370 }, { "epoch": 19.07815631262525, "grad_norm": 16.722375869750977, "learning_rate": 2.2401433691756272e-06, "loss": 0.3041, "step": 2380 }, { "epoch": 19.158316633266534, "grad_norm": 23.04833221435547, "learning_rate": 2.0161290322580646e-06, "loss": 0.301, "step": 2390 }, { "epoch": 19.238476953907817, "grad_norm": 20.021759033203125, "learning_rate": 1.7921146953405017e-06, "loss": 0.2723, "step": 2400 }, { "epoch": 19.318637274549097, "grad_norm": 18.34398078918457, "learning_rate": 1.5681003584229391e-06, "loss": 0.2484, "step": 2410 }, { "epoch": 19.39879759519038, "grad_norm": 18.83694839477539, "learning_rate": 1.3440860215053765e-06, "loss": 0.3204, "step": 2420 }, { "epoch": 19.478957915831664, "grad_norm": 16.674007415771484, "learning_rate": 1.1200716845878136e-06, "loss": 0.3034, "step": 2430 }, { "epoch": 19.559118236472948, "grad_norm": 28.710783004760742, "learning_rate": 8.960573476702509e-07, "loss": 0.3078, "step": 2440 }, { "epoch": 19.639278557114228, "grad_norm": 18.92753791809082, "learning_rate": 6.720430107526882e-07, "loss": 0.2644, "step": 2450 }, { "epoch": 19.71943887775551, "grad_norm": 17.13638687133789, "learning_rate": 4.4802867383512544e-07, "loss": 0.2673, "step": 2460 }, { "epoch": 19.799599198396795, "grad_norm": 18.35292625427246, "learning_rate": 2.2401433691756272e-07, "loss": 0.2569, "step": 2470 }, { "epoch": 19.879759519038075, "grad_norm": 18.86850929260254, "learning_rate": 0.0, "loss": 0.2456, "step": 2480 }, { "epoch": 19.879759519038075, "eval_accuracy": 0.8664036076662909, "eval_loss": 0.4505213499069214, "eval_runtime": 16.6944, "eval_samples_per_second": 106.263, "eval_steps_per_second": 3.354, "step": 2480 }, { "epoch": 19.879759519038075, "step": 2480, "total_flos": 8.259382470828884e+18, "train_loss": 0.5719680822664691, "train_runtime": 7149.119, "train_samples_per_second": 44.654, "train_steps_per_second": 0.347 } ], "logging_steps": 10, "max_steps": 2480, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "total_flos": 8.259382470828884e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }