Think2SQL-14B / trainer_state.json
anonymous-2321's picture
Commit folder
27de8e9 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.999559277214632,
"eval_steps": 500,
"global_step": 567,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 477.0,
"completions/max_terminated_length": 477.0,
"completions/mean_length": 175.50390625,
"completions/mean_terminated_length": 175.50390625,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.0017628911414720142,
"grad_norm": 1.0880173896572545,
"kl": 0.0,
"learning_rate": 0.0,
"loss": -0.327,
"num_tokens": 129409.0,
"reward": 0.814777672290802,
"reward_std": 0.14736539125442505,
"rewards/format_reward/mean": 0.68359375,
"rewards/format_reward/std": 0.4659844934940338,
"rewards/qatch_metrics/mean": 0.8332747220993042,
"rewards/qatch_metrics/std": 0.3284282088279724,
"rewards/tag_count_reward/mean": 0.7626953125,
"rewards/tag_count_reward/std": 0.34948837757110596,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 421.0,
"completions/max_terminated_length": 421.0,
"completions/mean_length": 177.318359375,
"completions/mean_terminated_length": 177.318359375,
"completions/min_length": 21.5,
"completions/min_terminated_length": 21.5,
"epoch": 0.00881445570736007,
"grad_norm": 0.9499188530188546,
"kl": 0.00019824504852294922,
"learning_rate": 7.017543859649122e-08,
"loss": -0.2902,
"num_tokens": 685703.0,
"reward": 0.762174516916275,
"reward_std": 0.15002675727009773,
"rewards/format_reward/mean": 0.7265625,
"rewards/format_reward/std": 0.4450720399618149,
"rewards/qatch_metrics/mean": 0.7644235193729401,
"rewards/qatch_metrics/std": 0.3610532283782959,
"rewards/tag_count_reward/mean": 0.795166015625,
"rewards/tag_count_reward/std": 0.33385463058948517,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 438.8,
"completions/max_terminated_length": 438.8,
"completions/mean_length": 173.41171875,
"completions/mean_terminated_length": 173.41171875,
"completions/min_length": 21.8,
"completions/min_terminated_length": 21.8,
"epoch": 0.01762891141472014,
"grad_norm": 0.9346895582900878,
"kl": 0.00028295516967773436,
"learning_rate": 1.5789473684210525e-07,
"loss": -0.2591,
"num_tokens": 1398566.0,
"reward": 0.7710299372673035,
"reward_std": 0.1539353460073471,
"rewards/format_reward/mean": 0.71796875,
"rewards/format_reward/std": 0.4487275779247284,
"rewards/qatch_metrics/mean": 0.7762346506118775,
"rewards/qatch_metrics/std": 0.3281721532344818,
"rewards/tag_count_reward/mean": 0.788671875,
"rewards/tag_count_reward/std": 0.33627479076385497,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 438.2,
"completions/max_terminated_length": 438.2,
"completions/mean_length": 183.1796875,
"completions/mean_terminated_length": 183.1796875,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.026443367122080213,
"grad_norm": 0.7943318239924386,
"kl": 0.00037631988525390627,
"learning_rate": 2.456140350877193e-07,
"loss": -0.2603,
"num_tokens": 2071996.0,
"reward": 0.7256837129592896,
"reward_std": 0.12991088777780532,
"rewards/format_reward/mean": 0.765625,
"rewards/format_reward/std": 0.4240167737007141,
"rewards/qatch_metrics/mean": 0.7151770830154419,
"rewards/qatch_metrics/std": 0.37596395611763,
"rewards/tag_count_reward/mean": 0.8244140625,
"rewards/tag_count_reward/std": 0.31790287494659425,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 479.6,
"completions/max_terminated_length": 479.6,
"completions/mean_length": 201.30234375,
"completions/mean_terminated_length": 201.30234375,
"completions/min_length": 21.2,
"completions/min_terminated_length": 21.2,
"epoch": 0.03525782282944028,
"grad_norm": 0.4721344642723057,
"kl": 0.00091400146484375,
"learning_rate": 3.333333333333333e-07,
"loss": -0.1315,
"num_tokens": 2791247.0,
"reward": 0.8173989057540894,
"reward_std": 0.12794919013977052,
"rewards/format_reward/mean": 0.89765625,
"rewards/format_reward/std": 0.29814977645874025,
"rewards/qatch_metrics/mean": 0.8017192721366883,
"rewards/qatch_metrics/std": 0.331482595205307,
"rewards/tag_count_reward/mean": 0.9234375,
"rewards/tag_count_reward/std": 0.22307254374027252,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 432.2,
"completions/max_terminated_length": 432.2,
"completions/mean_length": 221.4625,
"completions/mean_terminated_length": 221.4625,
"completions/min_length": 51.4,
"completions/min_terminated_length": 51.4,
"epoch": 0.044072278536800354,
"grad_norm": 0.29592079815815686,
"kl": 0.0016038894653320312,
"learning_rate": 4.2105263157894733e-07,
"loss": -0.0424,
"num_tokens": 3536975.0,
"reward": 0.7564297676086426,
"reward_std": 0.08200130835175515,
"rewards/format_reward/mean": 0.96953125,
"rewards/format_reward/std": 0.13422587364912034,
"rewards/qatch_metrics/mean": 0.7183640837669373,
"rewards/qatch_metrics/std": 0.3674669623374939,
"rewards/tag_count_reward/mean": 0.97734375,
"rewards/tag_count_reward/std": 0.09909781143069267,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 445.6,
"completions/max_terminated_length": 445.6,
"completions/mean_length": 216.53984375,
"completions/mean_terminated_length": 216.53984375,
"completions/min_length": 77.0,
"completions/min_terminated_length": 77.0,
"epoch": 0.052886734244160426,
"grad_norm": 0.275794455786416,
"kl": 0.0034694671630859375,
"learning_rate": 5.087719298245614e-07,
"loss": 0.002,
"num_tokens": 4281330.0,
"reward": 0.7764788866043091,
"reward_std": 0.09769791960716248,
"rewards/format_reward/mean": 0.9953125,
"rewards/format_reward/std": 0.06028594672679901,
"rewards/qatch_metrics/mean": 0.7377692699432373,
"rewards/qatch_metrics/std": 0.3548368811607361,
"rewards/tag_count_reward/mean": 0.996875,
"rewards/tag_count_reward/std": 0.04124387204647064,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 445.8,
"completions/max_terminated_length": 445.8,
"completions/mean_length": 220.11796875,
"completions/mean_terminated_length": 220.11796875,
"completions/min_length": 59.8,
"completions/min_terminated_length": 59.8,
"epoch": 0.06170118995152049,
"grad_norm": 0.2691159080285212,
"kl": 0.005501174926757812,
"learning_rate": 5.964912280701754e-07,
"loss": -0.0083,
"num_tokens": 5008025.0,
"reward": 0.8268720507621765,
"reward_std": 0.08243840038776398,
"rewards/format_reward/mean": 0.99609375,
"rewards/format_reward/std": 0.0625,
"rewards/qatch_metrics/mean": 0.7969059944152832,
"rewards/qatch_metrics/std": 0.30500164330005647,
"rewards/tag_count_reward/mean": 0.9978515625,
"rewards/tag_count_reward/std": 0.03437500074505806,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 487.2,
"completions/max_terminated_length": 487.2,
"completions/mean_length": 227.76015625,
"completions/mean_terminated_length": 227.76015625,
"completions/min_length": 83.4,
"completions/min_terminated_length": 83.4,
"epoch": 0.07051564565888056,
"grad_norm": 0.33908836616855625,
"kl": 0.002800750732421875,
"learning_rate": 6.842105263157895e-07,
"loss": 0.0002,
"num_tokens": 5774806.0,
"reward": 0.7647829532623291,
"reward_std": 0.09533883556723595,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.025,
"rewards/qatch_metrics/mean": 0.7235268354415894,
"rewards/qatch_metrics/std": 0.35323665738105775,
"rewards/tag_count_reward/mean": 0.998828125,
"rewards/tag_count_reward/std": 0.01875,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 476.2,
"completions/max_terminated_length": 476.2,
"completions/mean_length": 221.7984375,
"completions/mean_terminated_length": 221.7984375,
"completions/min_length": 83.4,
"completions/min_terminated_length": 83.4,
"epoch": 0.07933010136624064,
"grad_norm": 0.3262303740341099,
"kl": 0.00310516357421875,
"learning_rate": 7.719298245614034e-07,
"loss": 0.0104,
"num_tokens": 6557268.0,
"reward": 0.7565465092658996,
"reward_std": 0.09911727011203766,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7135841250419617,
"rewards/qatch_metrics/std": 0.37862626910209657,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 512.8,
"completions/max_terminated_length": 512.8,
"completions/mean_length": 228.45546875,
"completions/mean_terminated_length": 228.45546875,
"completions/min_length": 76.0,
"completions/min_terminated_length": 76.0,
"epoch": 0.08814455707360071,
"grad_norm": 0.23276410584015308,
"kl": 0.00273895263671875,
"learning_rate": 8.596491228070175e-07,
"loss": -0.0018,
"num_tokens": 7327499.0,
"reward": 0.7988326072692871,
"reward_std": 0.06667622029781342,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.025,
"rewards/qatch_metrics/mean": 0.7635622501373291,
"rewards/qatch_metrics/std": 0.369570130109787,
"rewards/tag_count_reward/mean": 0.99921875,
"rewards/tag_count_reward/std": 0.0125,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 483.8,
"completions/max_terminated_length": 483.8,
"completions/mean_length": 220.52734375,
"completions/mean_terminated_length": 220.52734375,
"completions/min_length": 81.2,
"completions/min_terminated_length": 81.2,
"epoch": 0.09695901278096078,
"grad_norm": 0.28218074028465906,
"kl": 0.00196533203125,
"learning_rate": 9.473684210526315e-07,
"loss": -0.0021,
"num_tokens": 8077390.0,
"reward": 0.8159880757331848,
"reward_std": 0.10231453701853752,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7835153818130494,
"rewards/qatch_metrics/std": 0.33782891631126405,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 481.2,
"completions/max_terminated_length": 481.2,
"completions/mean_length": 223.60703125,
"completions/mean_terminated_length": 223.60703125,
"completions/min_length": 75.6,
"completions/min_terminated_length": 75.6,
"epoch": 0.10577346848832085,
"grad_norm": 0.23258401790732933,
"kl": 0.00223388671875,
"learning_rate": 1e-06,
"loss": -0.0045,
"num_tokens": 8800407.0,
"reward": 0.74871985912323,
"reward_std": 0.09312780797481537,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7043763160705566,
"rewards/qatch_metrics/std": 0.39227073788642886,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 487.8,
"completions/max_terminated_length": 487.8,
"completions/mean_length": 222.81015625,
"completions/mean_terminated_length": 222.81015625,
"completions/min_length": 77.4,
"completions/min_terminated_length": 77.4,
"epoch": 0.11458792419568092,
"grad_norm": 0.22445170455470606,
"kl": 0.002956390380859375,
"learning_rate": 1e-06,
"loss": 0.0057,
"num_tokens": 9557380.0,
"reward": 0.8077908515930176,
"reward_std": 0.09828853458166123,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.025,
"rewards/qatch_metrics/mean": 0.774078369140625,
"rewards/qatch_metrics/std": 0.33206661343574523,
"rewards/tag_count_reward/mean": 0.999609375,
"rewards/tag_count_reward/std": 0.00625,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 492.6,
"completions/max_terminated_length": 492.6,
"completions/mean_length": 231.83984375,
"completions/mean_terminated_length": 231.83984375,
"completions/min_length": 94.6,
"completions/min_terminated_length": 94.6,
"epoch": 0.12340237990304098,
"grad_norm": 0.22832903725685313,
"kl": 0.00381317138671875,
"learning_rate": 1e-06,
"loss": 0.0025,
"num_tokens": 10339127.0,
"reward": 0.7895300030708313,
"reward_std": 0.10415169298648834,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.025,
"rewards/qatch_metrics/mean": 0.7526065230369567,
"rewards/qatch_metrics/std": 0.3542828977108002,
"rewards/tag_count_reward/mean": 0.9994140625,
"rewards/tag_count_reward/std": 0.009375,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 521.8,
"completions/max_terminated_length": 521.8,
"completions/mean_length": 236.3125,
"completions/mean_terminated_length": 236.3125,
"completions/min_length": 80.4,
"completions/min_terminated_length": 80.4,
"epoch": 0.13221683561040107,
"grad_norm": 0.2597151805235052,
"kl": 0.00432281494140625,
"learning_rate": 1e-06,
"loss": 0.0083,
"num_tokens": 11147287.0,
"reward": 0.7333161950111389,
"reward_std": 0.08832715749740601,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.6862887978553772,
"rewards/qatch_metrics/std": 0.36336439847946167,
"rewards/tag_count_reward/mean": 0.9994140625,
"rewards/tag_count_reward/std": 0.0069767430424690245,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 445.6,
"completions/max_terminated_length": 445.6,
"completions/mean_length": 216.43984375,
"completions/mean_terminated_length": 216.43984375,
"completions/min_length": 87.8,
"completions/min_terminated_length": 87.8,
"epoch": 0.14103129131776113,
"grad_norm": 0.2463929158667687,
"kl": 0.00528717041015625,
"learning_rate": 1e-06,
"loss": 0.0044,
"num_tokens": 11891066.0,
"reward": 0.8300724029541016,
"reward_std": 0.09615504890680313,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8000851631164551,
"rewards/qatch_metrics/std": 0.3208737909793854,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 491.6,
"completions/max_terminated_length": 491.6,
"completions/mean_length": 225.32890625,
"completions/mean_terminated_length": 225.32890625,
"completions/min_length": 86.2,
"completions/min_terminated_length": 86.2,
"epoch": 0.1498457470251212,
"grad_norm": 0.22719354366888944,
"kl": 0.005328369140625,
"learning_rate": 1e-06,
"loss": 0.0129,
"num_tokens": 12668159.0,
"reward": 0.816937243938446,
"reward_std": 0.08283708170056343,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7846320390701294,
"rewards/qatch_metrics/std": 0.32469419240951536,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 460.4,
"completions/max_terminated_length": 460.4,
"completions/mean_length": 217.92890625,
"completions/mean_terminated_length": 217.92890625,
"completions/min_length": 76.2,
"completions/min_terminated_length": 76.2,
"epoch": 0.15866020273248127,
"grad_norm": 0.2721517170479785,
"kl": 0.00579071044921875,
"learning_rate": 1e-06,
"loss": 0.0117,
"num_tokens": 13413588.0,
"reward": 0.7426301956176757,
"reward_std": 0.0905102699995041,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.6972119808197021,
"rewards/qatch_metrics/std": 0.37120566368103025,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 428.6,
"completions/max_terminated_length": 428.6,
"completions/mean_length": 204.6640625,
"completions/mean_terminated_length": 204.6640625,
"completions/min_length": 75.6,
"completions/min_terminated_length": 75.6,
"epoch": 0.16747465843984133,
"grad_norm": 0.2525985499058037,
"kl": 0.0056243896484375,
"learning_rate": 1e-06,
"loss": -0.0012,
"num_tokens": 14111606.0,
"reward": 0.7979554295539856,
"reward_std": 0.06609301418066024,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7623119950294495,
"rewards/qatch_metrics/std": 0.34469759464263916,
"rewards/tag_count_reward/mean": 0.9998046875,
"rewards/tag_count_reward/std": 0.003125,
"step": 95
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 431.0,
"completions/max_terminated_length": 431.0,
"completions/mean_length": 212.34765625,
"completions/mean_terminated_length": 212.34765625,
"completions/min_length": 69.2,
"completions/min_terminated_length": 69.2,
"epoch": 0.17628911414720141,
"grad_norm": 0.30357672091416305,
"kl": 0.0057861328125,
"learning_rate": 1e-06,
"loss": 0.0083,
"num_tokens": 14876659.0,
"reward": 0.7724857568740845,
"reward_std": 0.09265935122966766,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7323476672172546,
"rewards/qatch_metrics/std": 0.33567925691604616,
"rewards/tag_count_reward/mean": 0.9998046875,
"rewards/tag_count_reward/std": 0.003125,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 463.4,
"completions/max_terminated_length": 463.4,
"completions/mean_length": 216.46875,
"completions/mean_terminated_length": 216.46875,
"completions/min_length": 80.4,
"completions/min_terminated_length": 80.4,
"epoch": 0.18510356985456147,
"grad_norm": 0.23780324977532238,
"kl": 0.0056549072265625,
"learning_rate": 1e-06,
"loss": -0.0087,
"num_tokens": 15600331.0,
"reward": 0.7508906722068787,
"reward_std": 0.0951332688331604,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7069302201271057,
"rewards/qatch_metrics/std": 0.38108278512954713,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 105
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 442.4,
"completions/max_terminated_length": 442.4,
"completions/mean_length": 216.578125,
"completions/mean_terminated_length": 216.578125,
"completions/min_length": 80.2,
"completions/min_terminated_length": 80.2,
"epoch": 0.19391802556192156,
"grad_norm": 0.21716869090526136,
"kl": 0.0054229736328125,
"learning_rate": 1e-06,
"loss": -0.0045,
"num_tokens": 16326015.0,
"reward": 0.8402611017227173,
"reward_std": 0.05716411247849464,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8120718836784363,
"rewards/qatch_metrics/std": 0.2929441839456558,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 110
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 428.2,
"completions/max_terminated_length": 428.2,
"completions/mean_length": 222.0265625,
"completions/mean_terminated_length": 222.0265625,
"completions/min_length": 78.0,
"completions/min_terminated_length": 78.0,
"epoch": 0.20273248126928162,
"grad_norm": 0.22835452896575356,
"kl": 0.0060882568359375,
"learning_rate": 1e-06,
"loss": -0.0017,
"num_tokens": 17091921.0,
"reward": 0.8265595078468323,
"reward_std": 0.07398260906338691,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7959523558616638,
"rewards/qatch_metrics/std": 0.3277123510837555,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 115
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 459.8,
"completions/max_terminated_length": 459.8,
"completions/mean_length": 220.7453125,
"completions/mean_terminated_length": 220.7453125,
"completions/min_length": 87.0,
"completions/min_terminated_length": 87.0,
"epoch": 0.2115469369766417,
"grad_norm": 0.22726862373109216,
"kl": 0.006689453125,
"learning_rate": 1e-06,
"loss": 0.0043,
"num_tokens": 17877371.0,
"reward": 0.8397867679595947,
"reward_std": 0.09087342023849487,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8115137934684753,
"rewards/qatch_metrics/std": 0.3017837733030319,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 491.4,
"completions/max_terminated_length": 491.4,
"completions/mean_length": 225.2140625,
"completions/mean_terminated_length": 225.2140625,
"completions/min_length": 75.4,
"completions/min_terminated_length": 75.4,
"epoch": 0.22036139268400176,
"grad_norm": 0.2004953082769917,
"kl": 0.00776519775390625,
"learning_rate": 1e-06,
"loss": -0.0056,
"num_tokens": 18623005.0,
"reward": 0.8202541828155517,
"reward_std": 0.07537120208144188,
"rewards/format_reward/mean": 0.99921875,
"rewards/format_reward/std": 0.0125,
"rewards/qatch_metrics/mean": 0.7886492252349854,
"rewards/qatch_metrics/std": 0.32776339948177335,
"rewards/tag_count_reward/mean": 0.999609375,
"rewards/tag_count_reward/std": 0.00625,
"step": 125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 456.8,
"completions/max_terminated_length": 456.8,
"completions/mean_length": 223.48203125,
"completions/mean_terminated_length": 223.48203125,
"completions/min_length": 78.2,
"completions/min_terminated_length": 78.2,
"epoch": 0.22917584839136185,
"grad_norm": 0.2341532579835068,
"kl": 0.00804290771484375,
"learning_rate": 1e-06,
"loss": 0.0096,
"num_tokens": 19349606.0,
"reward": 0.8026262044906616,
"reward_std": 0.06839245334267616,
"rewards/format_reward/mean": 0.99921875,
"rewards/format_reward/std": 0.0125,
"rewards/qatch_metrics/mean": 0.7679218888282776,
"rewards/qatch_metrics/std": 0.3324147403240204,
"rewards/tag_count_reward/mean": 0.9994140625,
"rewards/tag_count_reward/std": 0.009375,
"step": 130
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 458.4,
"completions/max_terminated_length": 458.4,
"completions/mean_length": 216.72578125,
"completions/mean_terminated_length": 216.72578125,
"completions/min_length": 86.2,
"completions/min_terminated_length": 86.2,
"epoch": 0.2379903040987219,
"grad_norm": 0.23655650548465582,
"kl": 0.0078033447265625,
"learning_rate": 1e-06,
"loss": 0.0007,
"num_tokens": 20092311.0,
"reward": 0.8197526335716248,
"reward_std": 0.0839143767952919,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7879442930221557,
"rewards/qatch_metrics/std": 0.3431123554706573,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 135
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 454.6,
"completions/max_terminated_length": 454.6,
"completions/mean_length": 204.48984375,
"completions/mean_terminated_length": 204.48984375,
"completions/min_length": 79.0,
"completions/min_terminated_length": 79.0,
"epoch": 0.24680475980608196,
"grad_norm": 0.2641797202959811,
"kl": 0.00862884521484375,
"learning_rate": 1e-06,
"loss": 0.0051,
"num_tokens": 20821962.0,
"reward": 0.8242111682891846,
"reward_std": 0.07407020255923272,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7931895971298217,
"rewards/qatch_metrics/std": 0.3176054835319519,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 443.2,
"completions/max_terminated_length": 443.2,
"completions/mean_length": 203.590625,
"completions/mean_terminated_length": 203.590625,
"completions/min_length": 86.6,
"completions/min_terminated_length": 86.6,
"epoch": 0.255619215513442,
"grad_norm": 0.263066002535131,
"kl": 0.009637451171875,
"learning_rate": 1e-06,
"loss": 0.0065,
"num_tokens": 21526046.0,
"reward": 0.7875781059265137,
"reward_std": 0.09901705384254456,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7501148462295533,
"rewards/qatch_metrics/std": 0.3672972857952118,
"rewards/tag_count_reward/mean": 0.999609375,
"rewards/tag_count_reward/std": 0.00625,
"step": 145
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 448.4,
"completions/max_terminated_length": 448.4,
"completions/mean_length": 208.90546875,
"completions/mean_terminated_length": 208.90546875,
"completions/min_length": 73.2,
"completions/min_terminated_length": 73.2,
"epoch": 0.26443367122080214,
"grad_norm": 0.2798500312218402,
"kl": 0.01026153564453125,
"learning_rate": 1e-06,
"loss": 0.0003,
"num_tokens": 22271333.0,
"reward": 0.818337082862854,
"reward_std": 0.07784928977489472,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7862788915634156,
"rewards/qatch_metrics/std": 0.3341992735862732,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 494.2,
"completions/max_terminated_length": 494.2,
"completions/mean_length": 209.651953125,
"completions/mean_terminated_length": 209.651953125,
"completions/min_length": 73.0,
"completions/min_terminated_length": 73.0,
"epoch": 0.5464962538563244,
"grad_norm": 0.2122029879190087,
"kl": 0.010993194580078126,
"learning_rate": 1e-06,
"loss": 0.0126,
"num_tokens": 23726666.0,
"reward": 0.811666476726532,
"reward_std": 0.0841904804110527,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7784311413764954,
"rewards/qatch_metrics/std": 0.32770459055900575,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 155
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 452.6,
"completions/max_terminated_length": 452.6,
"completions/mean_length": 217.9859375,
"completions/mean_terminated_length": 217.9859375,
"completions/min_length": 75.8,
"completions/min_terminated_length": 75.8,
"epoch": 0.5641251652710445,
"grad_norm": 0.15403477284537095,
"kl": 0.00980377197265625,
"learning_rate": 1e-06,
"loss": 0.0008,
"num_tokens": 25239750.0,
"reward": 0.7868865132331848,
"reward_std": 0.07244862839579583,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7492782354354859,
"rewards/qatch_metrics/std": 0.3493395745754242,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 160
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 511.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 208.38984375,
"completions/mean_terminated_length": 208.38984375,
"completions/min_length": 58.4,
"completions/min_terminated_length": 58.4,
"epoch": 0.5817540766857646,
"grad_norm": 0.18706575889421317,
"kl": 0.00914154052734375,
"learning_rate": 1e-06,
"loss": 0.0072,
"num_tokens": 26687596.0,
"reward": 0.828769075870514,
"reward_std": 0.07729479111731052,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7985518336296081,
"rewards/qatch_metrics/std": 0.29670341312885284,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 165
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 505.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 206.281640625,
"completions/mean_terminated_length": 206.281640625,
"completions/min_length": 79.0,
"completions/min_terminated_length": 79.0,
"epoch": 0.5993829881004848,
"grad_norm": 0.19776450858978561,
"kl": 0.01090240478515625,
"learning_rate": 1e-06,
"loss": 0.0105,
"num_tokens": 28175773.0,
"reward": 0.8511051416397095,
"reward_std": 0.07431531846523284,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8248295664787293,
"rewards/qatch_metrics/std": 0.3192874014377594,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 170
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 462.2,
"completions/max_terminated_length": 462.2,
"completions/mean_length": 219.3890625,
"completions/mean_terminated_length": 219.3890625,
"completions/min_length": 70.8,
"completions/min_terminated_length": 70.8,
"epoch": 0.617011899515205,
"grad_norm": 0.15290022120008429,
"kl": 0.01065216064453125,
"learning_rate": 1e-06,
"loss": 0.0047,
"num_tokens": 29739969.0,
"reward": 0.8426113128662109,
"reward_std": 0.09004694148898125,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.814836847782135,
"rewards/qatch_metrics/std": 0.309688937664032,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 175
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 458.8,
"completions/max_terminated_length": 458.8,
"completions/mean_length": 211.655078125,
"completions/mean_terminated_length": 211.655078125,
"completions/min_length": 73.4,
"completions/min_terminated_length": 73.4,
"epoch": 0.6346408109299251,
"grad_norm": 0.17923424569681315,
"kl": 0.0114501953125,
"learning_rate": 1e-06,
"loss": 0.01,
"num_tokens": 31191502.0,
"reward": 0.8262084484100342,
"reward_std": 0.08637549504637718,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7955393195152283,
"rewards/qatch_metrics/std": 0.3134327620267868,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 180
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 499.2,
"completions/max_terminated_length": 499.2,
"completions/mean_length": 215.95546875,
"completions/mean_terminated_length": 215.95546875,
"completions/min_length": 78.0,
"completions/min_terminated_length": 78.0,
"epoch": 0.6522697223446452,
"grad_norm": 0.1321015357675111,
"kl": 0.012237548828125,
"learning_rate": 1e-06,
"loss": 0.0021,
"num_tokens": 32694108.0,
"reward": 0.7994898676872253,
"reward_std": 0.08254800513386726,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.764105749130249,
"rewards/qatch_metrics/std": 0.3532308578491211,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 185
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 442.2,
"completions/max_terminated_length": 442.2,
"completions/mean_length": 209.90078125,
"completions/mean_terminated_length": 209.90078125,
"completions/min_length": 76.6,
"completions/min_terminated_length": 76.6,
"epoch": 0.6698986337593653,
"grad_norm": 0.22256806005967145,
"kl": 0.01057586669921875,
"learning_rate": 1e-06,
"loss": 0.0013,
"num_tokens": 34144670.0,
"reward": 0.7911163926124573,
"reward_std": 0.06518566869199276,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7542545795440674,
"rewards/qatch_metrics/std": 0.35398219227790834,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 190
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 476.2,
"completions/max_terminated_length": 476.2,
"completions/mean_length": 208.534765625,
"completions/mean_terminated_length": 208.534765625,
"completions/min_length": 77.8,
"completions/min_terminated_length": 77.8,
"epoch": 0.6875275451740855,
"grad_norm": 0.17237028945675698,
"kl": 0.0087860107421875,
"learning_rate": 1e-06,
"loss": 0.0069,
"num_tokens": 35620023.0,
"reward": 0.8418472170829773,
"reward_std": 0.08243692219257355,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8139379024505615,
"rewards/qatch_metrics/std": 0.336453515291214,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 195
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 514.6,
"completions/max_terminated_length": 514.6,
"completions/mean_length": 217.3328125,
"completions/mean_terminated_length": 217.3328125,
"completions/min_length": 90.4,
"completions/min_terminated_length": 90.4,
"epoch": 0.7051564565888057,
"grad_norm": 0.19274445010407998,
"kl": 0.009130859375,
"learning_rate": 1e-06,
"loss": 0.0053,
"num_tokens": 37166635.0,
"reward": 0.8295193314552307,
"reward_std": 0.06927115023136139,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7994345307350159,
"rewards/qatch_metrics/std": 0.3011426508426666,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 200
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 499.8,
"completions/max_terminated_length": 499.8,
"completions/mean_length": 212.651171875,
"completions/mean_terminated_length": 212.651171875,
"completions/min_length": 68.6,
"completions/min_terminated_length": 68.6,
"epoch": 0.7227853680035258,
"grad_norm": 0.13990900967805797,
"kl": 0.0087432861328125,
"learning_rate": 1e-06,
"loss": -0.0027,
"num_tokens": 38617966.0,
"reward": 0.8151894211769104,
"reward_std": 0.07495353966951371,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7825757980346679,
"rewards/qatch_metrics/std": 0.33874245882034304,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 205
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 560.4,
"completions/max_terminated_length": 560.4,
"completions/mean_length": 223.7015625,
"completions/mean_terminated_length": 223.7015625,
"completions/min_length": 74.6,
"completions/min_terminated_length": 74.6,
"epoch": 0.7404142794182459,
"grad_norm": 0.20163985914598806,
"kl": 0.00806884765625,
"learning_rate": 1e-06,
"loss": 0.0054,
"num_tokens": 40092050.0,
"reward": 0.8460610270500183,
"reward_std": 0.05867695920169354,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8188953161239624,
"rewards/qatch_metrics/std": 0.3239317536354065,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 210
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 486.6,
"completions/max_terminated_length": 486.6,
"completions/mean_length": 215.6828125,
"completions/mean_terminated_length": 215.6828125,
"completions/min_length": 82.2,
"completions/min_terminated_length": 82.2,
"epoch": 0.7580431908329661,
"grad_norm": 0.17564998217230318,
"kl": 0.009525299072265625,
"learning_rate": 1e-06,
"loss": -0.0034,
"num_tokens": 41565542.0,
"reward": 0.799136507511139,
"reward_std": 0.06419738680124283,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7636899828910828,
"rewards/qatch_metrics/std": 0.3342160403728485,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 215
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 503.4,
"completions/max_terminated_length": 503.4,
"completions/mean_length": 233.409765625,
"completions/mean_terminated_length": 233.409765625,
"completions/min_length": 91.0,
"completions/min_terminated_length": 91.0,
"epoch": 0.7756721022476862,
"grad_norm": 0.19283324501226842,
"kl": 0.009130096435546875,
"learning_rate": 1e-06,
"loss": 0.0109,
"num_tokens": 43081919.0,
"reward": 0.7851791024208069,
"reward_std": 0.07570808604359627,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7472695469856262,
"rewards/qatch_metrics/std": 0.36822828054428103,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 220
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 448.4,
"completions/max_terminated_length": 448.4,
"completions/mean_length": 224.728125,
"completions/mean_terminated_length": 224.728125,
"completions/min_length": 81.0,
"completions/min_terminated_length": 81.0,
"epoch": 0.7933010136624064,
"grad_norm": 0.17754847688569442,
"kl": 0.009470367431640625,
"learning_rate": 1e-06,
"loss": -0.002,
"num_tokens": 44606439.0,
"reward": 0.8152384400367737,
"reward_std": 0.09764492362737656,
"rewards/format_reward/mean": 0.999609375,
"rewards/format_reward/std": 0.00883883461356163,
"rewards/qatch_metrics/mean": 0.7826851725578308,
"rewards/qatch_metrics/std": 0.3263732075691223,
"rewards/tag_count_reward/mean": 0.99990234375,
"rewards/tag_count_reward/std": 0.0022097086533904076,
"step": 225
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 503.2,
"completions/max_terminated_length": 503.2,
"completions/mean_length": 218.58203125,
"completions/mean_terminated_length": 218.58203125,
"completions/min_length": 69.6,
"completions/min_terminated_length": 69.6,
"epoch": 0.8109299250771265,
"grad_norm": 0.19017267970498908,
"kl": 0.009508514404296875,
"learning_rate": 1e-06,
"loss": -0.0009,
"num_tokens": 46095257.0,
"reward": 0.8068280577659607,
"reward_std": 0.0781441181898117,
"rewards/format_reward/mean": 0.999609375,
"rewards/format_reward/std": 0.00883883461356163,
"rewards/qatch_metrics/mean": 0.7728020906448364,
"rewards/qatch_metrics/std": 0.3386655867099762,
"rewards/tag_count_reward/mean": 0.99970703125,
"rewards/tag_count_reward/std": 0.006629125773906707,
"step": 230
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 456.6,
"completions/max_terminated_length": 456.6,
"completions/mean_length": 204.84375,
"completions/mean_terminated_length": 204.84375,
"completions/min_length": 72.6,
"completions/min_terminated_length": 72.6,
"epoch": 0.8285588364918466,
"grad_norm": 0.1678878918468119,
"kl": 0.009729766845703125,
"learning_rate": 1e-06,
"loss": 0.0041,
"num_tokens": 47519433.0,
"reward": 0.8672606706619262,
"reward_std": 0.0644603468477726,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8438360691070557,
"rewards/qatch_metrics/std": 0.2717843741178513,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 235
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 526.2,
"completions/max_terminated_length": 526.2,
"completions/mean_length": 214.90625,
"completions/mean_terminated_length": 214.90625,
"completions/min_length": 66.8,
"completions/min_terminated_length": 66.8,
"epoch": 0.8461877479065668,
"grad_norm": 0.18169011669761398,
"kl": 0.01288604736328125,
"learning_rate": 1e-06,
"loss": 0.0035,
"num_tokens": 48943993.0,
"reward": 0.8558493018150329,
"reward_std": 0.07027828097343444,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8304109454154969,
"rewards/qatch_metrics/std": 0.301141357421875,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 240
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 472.6,
"completions/max_terminated_length": 472.6,
"completions/mean_length": 212.559765625,
"completions/mean_terminated_length": 212.559765625,
"completions/min_length": 76.8,
"completions/min_terminated_length": 76.8,
"epoch": 0.8638166593212869,
"grad_norm": 0.2046340854229955,
"kl": 0.01494140625,
"learning_rate": 1e-06,
"loss": 0.006,
"num_tokens": 50416114.0,
"reward": 0.831060528755188,
"reward_std": 0.07754805404692888,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8012476563453674,
"rewards/qatch_metrics/std": 0.3293557226657867,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 245
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 503.2,
"completions/max_terminated_length": 503.2,
"completions/mean_length": 222.1375,
"completions/mean_terminated_length": 222.1375,
"completions/min_length": 83.2,
"completions/min_terminated_length": 83.2,
"epoch": 0.881445570736007,
"grad_norm": 0.15161264539796646,
"kl": 0.0138031005859375,
"learning_rate": 1e-06,
"loss": 0.0018,
"num_tokens": 51932274.0,
"reward": 0.8422249555587769,
"reward_std": 0.06234893724322319,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8143823027610779,
"rewards/qatch_metrics/std": 0.2993943512439728,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 250
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 463.6,
"completions/max_terminated_length": 463.6,
"completions/mean_length": 231.19609375,
"completions/mean_terminated_length": 231.19609375,
"completions/min_length": 77.4,
"completions/min_terminated_length": 77.4,
"epoch": 0.8990744821507272,
"grad_norm": 0.20035266636054513,
"kl": 0.011871337890625,
"learning_rate": 1e-06,
"loss": -0.0003,
"num_tokens": 53450248.0,
"reward": 0.8096501588821411,
"reward_std": 0.06698438860476016,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7760589838027954,
"rewards/qatch_metrics/std": 0.3199191153049469,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 255
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 471.2,
"completions/max_terminated_length": 471.2,
"completions/mean_length": 237.80234375,
"completions/mean_terminated_length": 237.80234375,
"completions/min_length": 82.2,
"completions/min_terminated_length": 82.2,
"epoch": 0.9167033935654474,
"grad_norm": 0.0856229450795828,
"kl": 0.011614227294921875,
"learning_rate": 1e-06,
"loss": -0.0017,
"num_tokens": 54970542.0,
"reward": 0.8725608706474304,
"reward_std": 0.051827043667435645,
"rewards/format_reward/mean": 0.999609375,
"rewards/format_reward/std": 0.00883883461356163,
"rewards/qatch_metrics/mean": 0.8501232981681823,
"rewards/qatch_metrics/std": 0.26386110931634904,
"rewards/tag_count_reward/mean": 0.99990234375,
"rewards/tag_count_reward/std": 0.0022097086533904076,
"step": 260
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 476.6,
"completions/max_terminated_length": 476.6,
"completions/mean_length": 231.53671875,
"completions/mean_terminated_length": 231.53671875,
"completions/min_length": 79.0,
"completions/min_terminated_length": 79.0,
"epoch": 0.9343323049801675,
"grad_norm": 0.17178453068271043,
"kl": 0.010117340087890624,
"learning_rate": 1e-06,
"loss": 0.0063,
"num_tokens": 56485356.0,
"reward": 0.8532873392105103,
"reward_std": 0.07009301483631133,
"rewards/format_reward/mean": 0.999609375,
"rewards/format_reward/std": 0.00883883461356163,
"rewards/qatch_metrics/mean": 0.8274485826492309,
"rewards/qatch_metrics/std": 0.31240676045417787,
"rewards/tag_count_reward/mean": 0.99990234375,
"rewards/tag_count_reward/std": 0.0022097086533904076,
"step": 265
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 459.6,
"completions/max_terminated_length": 459.6,
"completions/mean_length": 220.95234375,
"completions/mean_terminated_length": 220.95234375,
"completions/min_length": 68.6,
"completions/min_terminated_length": 68.6,
"epoch": 0.9519612163948876,
"grad_norm": 0.15364550208264494,
"kl": 0.00984039306640625,
"learning_rate": 1e-06,
"loss": -0.0031,
"num_tokens": 57953010.0,
"reward": 0.868242597579956,
"reward_std": 0.06916632130742073,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8449912905693054,
"rewards/qatch_metrics/std": 0.2899660974740982,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 270
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 423.2,
"completions/max_terminated_length": 423.2,
"completions/mean_length": 225.621875,
"completions/mean_terminated_length": 225.621875,
"completions/min_length": 88.8,
"completions/min_terminated_length": 88.8,
"epoch": 0.48479506390480387,
"grad_norm": 0.17697767584196022,
"kl": 0.00970916748046875,
"learning_rate": 1e-06,
"loss": 0.0059,
"num_tokens": 58736110.0,
"reward": 0.8460039258003235,
"reward_std": 0.055821475386619565,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8188281297683716,
"rewards/qatch_metrics/std": 0.30660555958747865,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 275
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 504.0,
"completions/max_terminated_length": 504.0,
"completions/mean_length": 223.92734375,
"completions/mean_terminated_length": 223.92734375,
"completions/min_length": 77.8,
"completions/min_terminated_length": 77.8,
"epoch": 0.4936095196121639,
"grad_norm": 0.2692630701899735,
"kl": 0.0131378173828125,
"learning_rate": 1e-06,
"loss": 0.0021,
"num_tokens": 59498897.0,
"reward": 0.7988754034042358,
"reward_std": 0.08376505076885224,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7633828163146973,
"rewards/qatch_metrics/std": 0.3335907101631165,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 280
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 434.8,
"completions/max_terminated_length": 434.8,
"completions/mean_length": 223.48125,
"completions/mean_terminated_length": 223.48125,
"completions/min_length": 83.2,
"completions/min_terminated_length": 83.2,
"epoch": 0.502423975319524,
"grad_norm": 0.2666009697829767,
"kl": 0.0107269287109375,
"learning_rate": 1e-06,
"loss": 0.0007,
"num_tokens": 60277897.0,
"reward": 0.7720089554786682,
"reward_std": 0.0594131164252758,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7317867279052734,
"rewards/qatch_metrics/std": 0.33845625519752504,
"rewards/tag_count_reward/mean": 0.9998046875,
"rewards/tag_count_reward/std": 0.003125,
"step": 285
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 481.0,
"completions/max_terminated_length": 481.0,
"completions/mean_length": 219.62421875,
"completions/mean_terminated_length": 219.62421875,
"completions/min_length": 91.0,
"completions/min_terminated_length": 91.0,
"epoch": 0.511238431026884,
"grad_norm": 0.16876063412105669,
"kl": 0.01141357421875,
"learning_rate": 1e-06,
"loss": 0.0033,
"num_tokens": 61033560.0,
"reward": 0.7902166962623596,
"reward_std": 0.0687429528683424,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7531961083412171,
"rewards/qatch_metrics/std": 0.37054654359817507,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 290
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 471.2,
"completions/max_terminated_length": 471.2,
"completions/mean_length": 226.275,
"completions/mean_terminated_length": 226.275,
"completions/min_length": 80.6,
"completions/min_terminated_length": 80.6,
"epoch": 0.5200528867342442,
"grad_norm": 0.26818466602074054,
"kl": 0.0130706787109375,
"learning_rate": 1e-06,
"loss": 0.0004,
"num_tokens": 61786008.0,
"reward": 0.7699209451675415,
"reward_std": 0.07550354823470115,
"rewards/format_reward/mean": 0.99921875,
"rewards/format_reward/std": 0.0125,
"rewards/qatch_metrics/mean": 0.7294221520423889,
"rewards/qatch_metrics/std": 0.3492735385894775,
"rewards/tag_count_reward/mean": 0.9998046875,
"rewards/tag_count_reward/std": 0.003125,
"step": 295
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 484.6,
"completions/max_terminated_length": 484.6,
"completions/mean_length": 247.2328125,
"completions/mean_terminated_length": 247.2328125,
"completions/min_length": 95.8,
"completions/min_terminated_length": 95.8,
"epoch": 0.5288673424416043,
"grad_norm": 0.16485515882678206,
"kl": 0.0113006591796875,
"learning_rate": 1e-06,
"loss": 0.0007,
"num_tokens": 62590434.0,
"reward": 0.8454334974288941,
"reward_std": 0.0570029616355896,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8181570172309875,
"rewards/qatch_metrics/std": 0.2992805689573288,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 300
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 480.6,
"completions/max_terminated_length": 480.6,
"completions/mean_length": 235.16015625,
"completions/mean_terminated_length": 235.16015625,
"completions/min_length": 88.0,
"completions/min_terminated_length": 88.0,
"epoch": 0.5376817981489643,
"grad_norm": 0.27561378534620606,
"kl": 0.01141510009765625,
"learning_rate": 1e-06,
"loss": 0.0097,
"num_tokens": 63366287.0,
"reward": 0.8380108118057251,
"reward_std": 0.07530387155711651,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8094244718551635,
"rewards/qatch_metrics/std": 0.30977231860160825,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 305
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 458.6,
"completions/max_terminated_length": 458.6,
"completions/mean_length": 215.646875,
"completions/mean_terminated_length": 215.646875,
"completions/min_length": 79.4,
"completions/min_terminated_length": 79.4,
"epoch": 0.5464962538563244,
"grad_norm": 0.2018916915779266,
"kl": 0.013714599609375,
"learning_rate": 1e-06,
"loss": -0.0045,
"num_tokens": 64097387.0,
"reward": 0.8135073184967041,
"reward_std": 0.05950811579823494,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7806198120117187,
"rewards/qatch_metrics/std": 0.33523867428302767,
"rewards/tag_count_reward/mean": 0.999609375,
"rewards/tag_count_reward/std": 0.00625,
"step": 310
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 447.4,
"completions/max_terminated_length": 447.4,
"completions/mean_length": 223.68515625,
"completions/mean_terminated_length": 223.68515625,
"completions/min_length": 92.4,
"completions/min_terminated_length": 92.4,
"epoch": 0.5553107095636844,
"grad_norm": 0.1836962735356692,
"kl": 0.0138214111328125,
"learning_rate": 1e-06,
"loss": -0.0024,
"num_tokens": 64869416.0,
"reward": 0.8333834052085877,
"reward_std": 0.07006162852048874,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8039804816246032,
"rewards/qatch_metrics/std": 0.3219245493412018,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 315
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 504.8,
"completions/max_terminated_length": 504.8,
"completions/mean_length": 221.45390625,
"completions/mean_terminated_length": 221.45390625,
"completions/min_length": 80.0,
"completions/min_terminated_length": 80.0,
"epoch": 0.5641251652710445,
"grad_norm": 0.23250178423343035,
"kl": 0.01497802734375,
"learning_rate": 1e-06,
"loss": -0.0014,
"num_tokens": 65613165.0,
"reward": 0.8320096850395202,
"reward_std": 0.053499556705355646,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8023643255233764,
"rewards/qatch_metrics/std": 0.3343039393424988,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 320
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 466.6,
"completions/max_terminated_length": 466.6,
"completions/mean_length": 220.3984375,
"completions/mean_terminated_length": 220.3984375,
"completions/min_length": 72.6,
"completions/min_terminated_length": 72.6,
"epoch": 0.5729396209784046,
"grad_norm": 0.09740281424559781,
"kl": 0.0155609130859375,
"learning_rate": 1e-06,
"loss": -0.0012,
"num_tokens": 66336475.0,
"reward": 0.8796087980270386,
"reward_std": 0.05236431676894426,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8583632946014405,
"rewards/qatch_metrics/std": 0.2817832052707672,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 325
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 459.4,
"completions/max_terminated_length": 459.4,
"completions/mean_length": 225.27890625,
"completions/mean_terminated_length": 225.27890625,
"completions/min_length": 78.6,
"completions/min_terminated_length": 78.6,
"epoch": 0.5817540766857646,
"grad_norm": 0.08354955287926201,
"kl": 0.01513671875,
"learning_rate": 1e-06,
"loss": 0.0095,
"num_tokens": 67098736.0,
"reward": 0.8658102512359619,
"reward_std": 0.07466748803853988,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8421296954154969,
"rewards/qatch_metrics/std": 0.2614422976970673,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 330
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 447.2,
"completions/max_terminated_length": 447.2,
"completions/mean_length": 220.01875,
"completions/mean_terminated_length": 220.01875,
"completions/min_length": 81.8,
"completions/min_terminated_length": 81.8,
"epoch": 0.5905685323931247,
"grad_norm": 0.20574209747901576,
"kl": 0.015081787109375,
"learning_rate": 1e-06,
"loss": 0.011,
"num_tokens": 67847928.0,
"reward": 0.865822184085846,
"reward_std": 0.046268445625901225,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8421437621116639,
"rewards/qatch_metrics/std": 0.29589260220527647,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 335
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 432.0,
"completions/max_terminated_length": 432.0,
"completions/mean_length": 213.5953125,
"completions/mean_terminated_length": 213.5953125,
"completions/min_length": 76.0,
"completions/min_terminated_length": 76.0,
"epoch": 0.5993829881004848,
"grad_norm": 0.2039975034177896,
"kl": 0.0161651611328125,
"learning_rate": 1e-06,
"loss": 0.0066,
"num_tokens": 68585234.0,
"reward": 0.8343551635742188,
"reward_std": 0.0688902921974659,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8051237106323242,
"rewards/qatch_metrics/std": 0.30847290754318235,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 340
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 435.0,
"completions/max_terminated_length": 435.0,
"completions/mean_length": 203.69453125,
"completions/mean_terminated_length": 203.69453125,
"completions/min_length": 76.0,
"completions/min_terminated_length": 76.0,
"epoch": 0.6081974438078449,
"grad_norm": 0.26848084439203446,
"kl": 0.014788818359375,
"learning_rate": 1e-06,
"loss": 0.0023,
"num_tokens": 69338379.0,
"reward": 0.8848124146461487,
"reward_std": 0.06373886093497276,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8644851684570313,
"rewards/qatch_metrics/std": 0.26705425381660464,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 345
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 446.8,
"completions/max_terminated_length": 446.8,
"completions/mean_length": 221.1046875,
"completions/mean_terminated_length": 221.1046875,
"completions/min_length": 79.4,
"completions/min_terminated_length": 79.4,
"epoch": 0.617011899515205,
"grad_norm": 0.2363792510293019,
"kl": 0.019024658203125,
"learning_rate": 1e-06,
"loss": -0.002,
"num_tokens": 70095473.0,
"reward": 0.8130708336830139,
"reward_std": 0.08477363213896752,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7800833344459533,
"rewards/qatch_metrics/std": 0.3211198329925537,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 350
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 458.4,
"completions/max_terminated_length": 458.4,
"completions/mean_length": 234.00078125,
"completions/mean_terminated_length": 234.00078125,
"completions/min_length": 91.4,
"completions/min_terminated_length": 91.4,
"epoch": 0.625826355222565,
"grad_norm": 0.1856420640121193,
"kl": 0.019122314453125,
"learning_rate": 1e-06,
"loss": -0.0036,
"num_tokens": 70860290.0,
"reward": 0.8471660256385803,
"reward_std": 0.0506692998111248,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8201953172683716,
"rewards/qatch_metrics/std": 0.30663308799266814,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 355
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 452.8,
"completions/max_terminated_length": 452.8,
"completions/mean_length": 241.72421875,
"completions/mean_terminated_length": 241.72421875,
"completions/min_length": 88.0,
"completions/min_terminated_length": 88.0,
"epoch": 0.6346408109299251,
"grad_norm": 0.22939974521057024,
"kl": 0.01826171875,
"learning_rate": 1e-06,
"loss": 0.0127,
"num_tokens": 71648401.0,
"reward": 0.8702264785766601,
"reward_std": 0.0592925101518631,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8473252534866333,
"rewards/qatch_metrics/std": 0.28537269234657286,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 360
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 434.6,
"completions/max_terminated_length": 434.6,
"completions/mean_length": 215.853125,
"completions/mean_terminated_length": 215.853125,
"completions/min_length": 75.0,
"completions/min_terminated_length": 75.0,
"epoch": 0.6434552666372851,
"grad_norm": 0.19883621919511643,
"kl": 0.0163330078125,
"learning_rate": 1e-06,
"loss": 0.0073,
"num_tokens": 72382693.0,
"reward": 0.8091506719589233,
"reward_std": 0.0635421834886074,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7754713773727417,
"rewards/qatch_metrics/std": 0.3179103255271912,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 365
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 496.8,
"completions/max_terminated_length": 496.8,
"completions/mean_length": 213.1640625,
"completions/mean_terminated_length": 213.1640625,
"completions/min_length": 76.2,
"completions/min_terminated_length": 76.2,
"epoch": 0.6522697223446452,
"grad_norm": 0.1916457590662772,
"kl": 0.0175506591796875,
"learning_rate": 1e-06,
"loss": -0.0029,
"num_tokens": 73161111.0,
"reward": 0.8094798445701599,
"reward_std": 0.04875086285173893,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7758586168289184,
"rewards/qatch_metrics/std": 0.32606661319732666,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 370
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 480.4,
"completions/max_terminated_length": 480.4,
"completions/mean_length": 222.75,
"completions/mean_terminated_length": 222.75,
"completions/min_length": 72.6,
"completions/min_terminated_length": 72.6,
"epoch": 0.6610841780520053,
"grad_norm": 0.15787517122504152,
"kl": 0.0181884765625,
"learning_rate": 1e-06,
"loss": 0.0018,
"num_tokens": 73905591.0,
"reward": 0.89048171043396,
"reward_std": 0.04932568361982703,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8711549639701843,
"rewards/qatch_metrics/std": 0.2736783862113953,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 375
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 561.4,
"completions/max_terminated_length": 561.4,
"completions/mean_length": 234.4390625,
"completions/mean_terminated_length": 234.4390625,
"completions/min_length": 75.2,
"completions/min_terminated_length": 75.2,
"epoch": 0.6698986337593653,
"grad_norm": 0.2653930596733297,
"kl": 0.0174713134765625,
"learning_rate": 1e-06,
"loss": 0.003,
"num_tokens": 74679801.0,
"reward": 0.8243065714836121,
"reward_std": 0.06958894729614258,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7933018207550049,
"rewards/qatch_metrics/std": 0.3086866676807404,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 380
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 511.8,
"completions/max_terminated_length": 511.8,
"completions/mean_length": 243.73359375,
"completions/mean_terminated_length": 243.73359375,
"completions/min_length": 81.6,
"completions/min_terminated_length": 81.6,
"epoch": 0.6787130894667255,
"grad_norm": 0.20233916054675122,
"kl": 0.014093017578125,
"learning_rate": 1e-06,
"loss": 0.0048,
"num_tokens": 75445892.0,
"reward": 0.8653998494148254,
"reward_std": 0.07132081612944603,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8416468739509583,
"rewards/qatch_metrics/std": 0.3147186517715454,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 385
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 447.4,
"completions/max_terminated_length": 447.4,
"completions/mean_length": 228.43671875,
"completions/mean_terminated_length": 228.43671875,
"completions/min_length": 79.8,
"completions/min_terminated_length": 79.8,
"epoch": 0.6875275451740855,
"grad_norm": 0.29996778931865303,
"kl": 0.0146087646484375,
"learning_rate": 1e-06,
"loss": -0.0016,
"num_tokens": 76229251.0,
"reward": 0.8502862334251404,
"reward_std": 0.07314281612634659,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8238661646842956,
"rewards/qatch_metrics/std": 0.3113024443387985,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 390
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 487.0,
"completions/max_terminated_length": 487.0,
"completions/mean_length": 249.8734375,
"completions/mean_terminated_length": 249.8734375,
"completions/min_length": 84.4,
"completions/min_terminated_length": 84.4,
"epoch": 0.6963420008814456,
"grad_norm": 0.2150032953896288,
"kl": 0.017156982421875,
"learning_rate": 1e-06,
"loss": -0.0029,
"num_tokens": 77021793.0,
"reward": 0.8494030237197876,
"reward_std": 0.05776047557592392,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8228270888328553,
"rewards/qatch_metrics/std": 0.3020846724510193,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 395
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 471.4,
"completions/max_terminated_length": 471.4,
"completions/mean_length": 247.9828125,
"completions/mean_terminated_length": 247.9828125,
"completions/min_length": 84.4,
"completions/min_terminated_length": 84.4,
"epoch": 0.7051564565888057,
"grad_norm": 0.2754041387856829,
"kl": 0.0148590087890625,
"learning_rate": 1e-06,
"loss": 0.0065,
"num_tokens": 77833451.0,
"reward": 0.8363431453704834,
"reward_std": 0.06054745838046074,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.807462501525879,
"rewards/qatch_metrics/std": 0.29668720066547394,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 400
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 442.6,
"completions/max_terminated_length": 442.6,
"completions/mean_length": 225.4296875,
"completions/mean_terminated_length": 225.4296875,
"completions/min_length": 83.0,
"completions/min_terminated_length": 83.0,
"epoch": 0.7139709122961657,
"grad_norm": 0.22420011771594078,
"kl": 0.017706298828125,
"learning_rate": 1e-06,
"loss": 0.0019,
"num_tokens": 78585793.0,
"reward": 0.8382049560546875,
"reward_std": 0.05150428526103497,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8096528768539428,
"rewards/qatch_metrics/std": 0.2925006330013275,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 405
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 618.4,
"completions/max_terminated_length": 618.4,
"completions/mean_length": 219.3421875,
"completions/mean_terminated_length": 219.3421875,
"completions/min_length": 84.6,
"completions/min_terminated_length": 84.6,
"epoch": 0.7227853680035258,
"grad_norm": 0.0986589707089894,
"kl": 0.0170196533203125,
"learning_rate": 1e-06,
"loss": 0.0022,
"num_tokens": 79352135.0,
"reward": 0.8465274453163147,
"reward_std": 0.05231629386544227,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8194440126419067,
"rewards/qatch_metrics/std": 0.3004340440034866,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 410
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 470.6,
"completions/max_terminated_length": 470.6,
"completions/mean_length": 213.9921875,
"completions/mean_terminated_length": 213.9921875,
"completions/min_length": 83.6,
"completions/min_terminated_length": 83.6,
"epoch": 0.7315998237108858,
"grad_norm": 0.17969166348358623,
"kl": 0.01600341796875,
"learning_rate": 1e-06,
"loss": 0.0196,
"num_tokens": 80093021.0,
"reward": 0.7899853944778442,
"reward_std": 0.06183199286460876,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7529239773750305,
"rewards/qatch_metrics/std": 0.32831716537475586,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 415
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 458.4,
"completions/max_terminated_length": 458.4,
"completions/mean_length": 208.25390625,
"completions/mean_terminated_length": 208.25390625,
"completions/min_length": 72.6,
"completions/min_terminated_length": 72.6,
"epoch": 0.7404142794182459,
"grad_norm": 0.12360613268228073,
"kl": 0.0170166015625,
"learning_rate": 1e-06,
"loss": -0.0011,
"num_tokens": 80810658.0,
"reward": 0.8781363725662231,
"reward_std": 0.04314489997923374,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8566309928894043,
"rewards/qatch_metrics/std": 0.2832080274820328,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 420
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 447.2,
"completions/max_terminated_length": 447.2,
"completions/mean_length": 203.98046875,
"completions/mean_terminated_length": 203.98046875,
"completions/min_length": 79.2,
"completions/min_terminated_length": 79.2,
"epoch": 0.749228735125606,
"grad_norm": 0.210810313322166,
"kl": 0.0164581298828125,
"learning_rate": 1e-06,
"loss": 0.003,
"num_tokens": 81548361.0,
"reward": 0.8270991563796997,
"reward_std": 0.06941422820091248,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7965872406959533,
"rewards/qatch_metrics/std": 0.33117216229438784,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 425
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 434.6,
"completions/max_terminated_length": 434.6,
"completions/mean_length": 220.75703125,
"completions/mean_terminated_length": 220.75703125,
"completions/min_length": 80.4,
"completions/min_terminated_length": 80.4,
"epoch": 0.7580431908329661,
"grad_norm": 0.21910688267881026,
"kl": 0.016754150390625,
"learning_rate": 1e-06,
"loss": -0.0012,
"num_tokens": 82290706.0,
"reward": 0.8464880228042603,
"reward_std": 0.04884184449911118,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8193976640701294,
"rewards/qatch_metrics/std": 0.28375020921230315,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 430
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 451.8,
"completions/max_terminated_length": 451.8,
"completions/mean_length": 223.1234375,
"completions/mean_terminated_length": 223.1234375,
"completions/min_length": 85.0,
"completions/min_terminated_length": 85.0,
"epoch": 0.7668576465403262,
"grad_norm": 0.26253720274856984,
"kl": 0.0178009033203125,
"learning_rate": 1e-06,
"loss": -0.0023,
"num_tokens": 83056976.0,
"reward": 0.8096219301223755,
"reward_std": 0.07494284212589264,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7760257959365845,
"rewards/qatch_metrics/std": 0.3492628037929535,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 435
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 501.2,
"completions/max_terminated_length": 501.2,
"completions/mean_length": 216.840625,
"completions/mean_terminated_length": 216.840625,
"completions/min_length": 88.8,
"completions/min_terminated_length": 88.8,
"epoch": 0.7756721022476862,
"grad_norm": 0.27647079947407377,
"kl": 0.0181732177734375,
"learning_rate": 1e-06,
"loss": 0.0012,
"num_tokens": 83805044.0,
"reward": 0.7776495218276978,
"reward_std": 0.056884029135108,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7384112119674683,
"rewards/qatch_metrics/std": 0.3683965981006622,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 440
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 455.0,
"completions/max_terminated_length": 455.0,
"completions/mean_length": 216.46015625,
"completions/mean_terminated_length": 216.46015625,
"completions/min_length": 78.2,
"completions/min_terminated_length": 78.2,
"epoch": 0.7844865579550463,
"grad_norm": 0.20996305667402082,
"kl": 0.0163116455078125,
"learning_rate": 1e-06,
"loss": 0.0063,
"num_tokens": 84571313.0,
"reward": 0.8477118849754334,
"reward_std": 0.06959039457142353,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8208375215530396,
"rewards/qatch_metrics/std": 0.30095059871673585,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 445
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 443.4,
"completions/max_terminated_length": 443.4,
"completions/mean_length": 211.00234375,
"completions/mean_terminated_length": 211.00234375,
"completions/min_length": 86.2,
"completions/min_terminated_length": 86.2,
"epoch": 0.7933010136624064,
"grad_norm": 0.15662206787116065,
"kl": 0.0160797119140625,
"learning_rate": 1e-06,
"loss": 0.0004,
"num_tokens": 85319188.0,
"reward": 0.8328658938407898,
"reward_std": 0.05801869332790375,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8033716320991516,
"rewards/qatch_metrics/std": 0.3037038058042526,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 450
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 475.2,
"completions/max_terminated_length": 475.2,
"completions/mean_length": 222.7078125,
"completions/mean_terminated_length": 222.7078125,
"completions/min_length": 80.8,
"completions/min_terminated_length": 80.8,
"epoch": 0.8021154693697664,
"grad_norm": 0.19919629119501958,
"kl": 0.01639404296875,
"learning_rate": 1e-06,
"loss": 0.001,
"num_tokens": 86091774.0,
"reward": 0.8358211517333984,
"reward_std": 0.0607087716460228,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8068713665008544,
"rewards/qatch_metrics/std": 0.30334635376930236,
"rewards/tag_count_reward/mean": 0.999609375,
"rewards/tag_count_reward/std": 0.00625,
"step": 455
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 418.4,
"completions/max_terminated_length": 418.4,
"completions/mean_length": 221.0015625,
"completions/mean_terminated_length": 221.0015625,
"completions/min_length": 80.4,
"completions/min_terminated_length": 80.4,
"epoch": 0.8109299250771265,
"grad_norm": 0.1419366062228353,
"kl": 0.01617431640625,
"learning_rate": 1e-06,
"loss": 0.0031,
"num_tokens": 86848528.0,
"reward": 0.8028954148292542,
"reward_std": 0.06934207193553447,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7681122660636902,
"rewards/qatch_metrics/std": 0.3390295565128326,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 460
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 426.6,
"completions/max_terminated_length": 426.6,
"completions/mean_length": 210.6140625,
"completions/mean_terminated_length": 210.6140625,
"completions/min_length": 85.0,
"completions/min_terminated_length": 85.0,
"epoch": 0.8197443807844865,
"grad_norm": 0.16116384181364513,
"kl": 0.0162078857421875,
"learning_rate": 1e-06,
"loss": 0.013,
"num_tokens": 87564482.0,
"reward": 0.8424649000167846,
"reward_std": 0.040234316140413284,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8146645903587342,
"rewards/qatch_metrics/std": 0.2840981811285019,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 465
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 447.6,
"completions/max_terminated_length": 447.6,
"completions/mean_length": 195.89140625,
"completions/mean_terminated_length": 195.89140625,
"completions/min_length": 80.8,
"completions/min_terminated_length": 80.8,
"epoch": 0.8285588364918466,
"grad_norm": 0.21075371504226795,
"kl": 0.0193115234375,
"learning_rate": 1e-06,
"loss": 0.001,
"num_tokens": 88255159.0,
"reward": 0.8565711379051208,
"reward_std": 0.06344871073961258,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8312601566314697,
"rewards/qatch_metrics/std": 0.3075568675994873,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 470
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 445.4,
"completions/max_terminated_length": 445.4,
"completions/mean_length": 201.01484375,
"completions/mean_terminated_length": 201.01484375,
"completions/min_length": 85.0,
"completions/min_terminated_length": 85.0,
"epoch": 0.8373732921992068,
"grad_norm": 0.27204162033836665,
"kl": 0.019976806640625,
"learning_rate": 1e-06,
"loss": -0.0009,
"num_tokens": 88945690.0,
"reward": 0.8785177230834961,
"reward_std": 0.06470721438527108,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8570796966552734,
"rewards/qatch_metrics/std": 0.2825317859649658,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 475
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 497.4,
"completions/max_terminated_length": 497.4,
"completions/mean_length": 221.90859375,
"completions/mean_terminated_length": 221.90859375,
"completions/min_length": 81.2,
"completions/min_terminated_length": 81.2,
"epoch": 0.8461877479065668,
"grad_norm": 0.19323853705899263,
"kl": 0.0183746337890625,
"learning_rate": 1e-06,
"loss": 0.0041,
"num_tokens": 89712373.0,
"reward": 0.8555493712425232,
"reward_std": 0.06230065375566483,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8300580739974975,
"rewards/qatch_metrics/std": 0.28706649839878084,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 480
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 498.8,
"completions/max_terminated_length": 498.8,
"completions/mean_length": 228.7890625,
"completions/mean_terminated_length": 228.7890625,
"completions/min_length": 88.4,
"completions/min_terminated_length": 88.4,
"epoch": 0.8550022036139269,
"grad_norm": 0.24770714763886528,
"kl": 0.0176513671875,
"learning_rate": 1e-06,
"loss": 0.004,
"num_tokens": 90520071.0,
"reward": 0.8527018785476684,
"reward_std": 0.062195781618356705,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8267080783843994,
"rewards/qatch_metrics/std": 0.2996180385351181,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 485
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 480.6,
"completions/max_terminated_length": 480.6,
"completions/mean_length": 227.66875,
"completions/mean_terminated_length": 227.66875,
"completions/min_length": 83.2,
"completions/min_terminated_length": 83.2,
"epoch": 0.8638166593212869,
"grad_norm": 0.16162980170931898,
"kl": 0.0188812255859375,
"learning_rate": 1e-06,
"loss": 0.0006,
"num_tokens": 91278479.0,
"reward": 0.8309607028961181,
"reward_std": 0.0656251635402441,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8011302351951599,
"rewards/qatch_metrics/std": 0.31802850365638735,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 490
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 502.4,
"completions/max_terminated_length": 502.4,
"completions/mean_length": 225.3171875,
"completions/mean_terminated_length": 225.3171875,
"completions/min_length": 80.4,
"completions/min_terminated_length": 80.4,
"epoch": 0.872631115028647,
"grad_norm": 0.1886973597841831,
"kl": 0.01859130859375,
"learning_rate": 1e-06,
"loss": 0.0052,
"num_tokens": 92033173.0,
"reward": 0.8441248655319213,
"reward_std": 0.043570340052247046,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8166174769401551,
"rewards/qatch_metrics/std": 0.30278873145580293,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 495
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 443.6,
"completions/max_terminated_length": 443.6,
"completions/mean_length": 234.475,
"completions/mean_terminated_length": 234.475,
"completions/min_length": 99.8,
"completions/min_terminated_length": 99.8,
"epoch": 0.881445570736007,
"grad_norm": 0.24444756963754977,
"kl": 0.01798095703125,
"learning_rate": 1e-06,
"loss": 0.007,
"num_tokens": 92808293.0,
"reward": 0.8517020106315613,
"reward_std": 0.06295906975865365,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.825531804561615,
"rewards/qatch_metrics/std": 0.3100520223379135,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 500
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 440.0,
"completions/max_terminated_length": 440.0,
"completions/mean_length": 215.0328125,
"completions/mean_terminated_length": 215.0328125,
"completions/min_length": 84.2,
"completions/min_terminated_length": 84.2,
"epoch": 0.8902600264433671,
"grad_norm": 0.21103775626066984,
"kl": 0.0171600341796875,
"learning_rate": 1e-06,
"loss": 0.0051,
"num_tokens": 93563327.0,
"reward": 0.8682243466377259,
"reward_std": 0.04365142099559307,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8449697852134704,
"rewards/qatch_metrics/std": 0.2696381151676178,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 505
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 463.4,
"completions/max_terminated_length": 463.4,
"completions/mean_length": 218.59453125,
"completions/mean_terminated_length": 218.59453125,
"completions/min_length": 77.0,
"completions/min_terminated_length": 77.0,
"epoch": 0.8990744821507272,
"grad_norm": 0.20107359914643413,
"kl": 0.016455078125,
"learning_rate": 1e-06,
"loss": 0.0086,
"num_tokens": 94333288.0,
"reward": 0.8064153909683227,
"reward_std": 0.06192653328180313,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.772253406047821,
"rewards/qatch_metrics/std": 0.3227865040302277,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 510
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 443.0,
"completions/max_terminated_length": 443.0,
"completions/mean_length": 206.5703125,
"completions/mean_terminated_length": 206.5703125,
"completions/min_length": 73.0,
"completions/min_terminated_length": 73.0,
"epoch": 0.9078889378580872,
"grad_norm": 0.10741725097461949,
"kl": 0.0163330078125,
"learning_rate": 1e-06,
"loss": 0.0056,
"num_tokens": 95051890.0,
"reward": 0.8839513182640075,
"reward_std": 0.04564618114382028,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8634721517562867,
"rewards/qatch_metrics/std": 0.24794530421495437,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 515
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 420.2,
"completions/max_terminated_length": 420.2,
"completions/mean_length": 193.584375,
"completions/mean_terminated_length": 193.584375,
"completions/min_length": 74.8,
"completions/min_terminated_length": 74.8,
"epoch": 0.9167033935654474,
"grad_norm": 0.3417922303720187,
"kl": 0.0196563720703125,
"learning_rate": 1e-06,
"loss": 0.0033,
"num_tokens": 95755150.0,
"reward": 0.8428452134132385,
"reward_std": 0.05727057494223118,
"rewards/format_reward/mean": 0.99921875,
"rewards/format_reward/std": 0.0125,
"rewards/qatch_metrics/mean": 0.8152039051055908,
"rewards/qatch_metrics/std": 0.31376497745513915,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 520
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 404.8,
"completions/max_terminated_length": 404.8,
"completions/mean_length": 208.72890625,
"completions/mean_terminated_length": 208.72890625,
"completions/min_length": 72.2,
"completions/min_terminated_length": 72.2,
"epoch": 0.9255178492728074,
"grad_norm": 0.17161657062686406,
"kl": 0.0185943603515625,
"learning_rate": 1e-06,
"loss": -0.0023,
"num_tokens": 96514835.0,
"reward": 0.8597602009773254,
"reward_std": 0.044371549785137174,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8350119948387146,
"rewards/qatch_metrics/std": 0.295586758852005,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 525
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 426.8,
"completions/max_terminated_length": 426.8,
"completions/mean_length": 212.95859375,
"completions/mean_terminated_length": 212.95859375,
"completions/min_length": 77.0,
"completions/min_terminated_length": 77.0,
"epoch": 0.9343323049801675,
"grad_norm": 0.22162383692372334,
"kl": 0.0186981201171875,
"learning_rate": 1e-06,
"loss": -0.002,
"num_tokens": 97270782.0,
"reward": 0.8363440155982971,
"reward_std": 0.06691965609788894,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8074635624885559,
"rewards/qatch_metrics/std": 0.3064163327217102,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 530
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 454.6,
"completions/max_terminated_length": 454.6,
"completions/mean_length": 233.40234375,
"completions/mean_terminated_length": 233.40234375,
"completions/min_length": 76.4,
"completions/min_terminated_length": 76.4,
"epoch": 0.9431467606875276,
"grad_norm": 0.1434511776519399,
"kl": 0.019879150390625,
"learning_rate": 1e-06,
"loss": 0.0023,
"num_tokens": 98016705.0,
"reward": 0.8363542199134827,
"reward_std": 0.05200971700251102,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8074755430221557,
"rewards/qatch_metrics/std": 0.2885085940361023,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 535
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 444.6,
"completions/max_terminated_length": 444.6,
"completions/mean_length": 235.70625,
"completions/mean_terminated_length": 235.70625,
"completions/min_length": 80.6,
"completions/min_terminated_length": 80.6,
"epoch": 0.9519612163948876,
"grad_norm": 0.09221258199209693,
"kl": 0.018701171875,
"learning_rate": 1e-06,
"loss": 0.0038,
"num_tokens": 98787193.0,
"reward": 0.8677037119865417,
"reward_std": 0.057669999450445174,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8443572998046875,
"rewards/qatch_metrics/std": 0.288933590054512,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 540
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 455.2,
"completions/max_terminated_length": 455.2,
"completions/mean_length": 222.11875,
"completions/mean_terminated_length": 222.11875,
"completions/min_length": 74.6,
"completions/min_terminated_length": 74.6,
"epoch": 0.9607756721022477,
"grad_norm": 0.1352237905149159,
"kl": 0.018145751953125,
"learning_rate": 1e-06,
"loss": -0.0031,
"num_tokens": 99532081.0,
"reward": 0.8805891752243042,
"reward_std": 0.05483146589249373,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8595166802406311,
"rewards/qatch_metrics/std": 0.25585181415081026,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 545
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 472.6,
"completions/max_terminated_length": 472.6,
"completions/mean_length": 218.659375,
"completions/mean_terminated_length": 218.659375,
"completions/min_length": 86.2,
"completions/min_terminated_length": 86.2,
"epoch": 0.9695901278096077,
"grad_norm": 0.16904630982662794,
"kl": 0.01783447265625,
"learning_rate": 1e-06,
"loss": 0.0016,
"num_tokens": 100246573.0,
"reward": 0.8569401383399964,
"reward_std": 0.07272802218794823,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8316942691802979,
"rewards/qatch_metrics/std": 0.3041912466287613,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 550
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 459.8,
"completions/max_terminated_length": 459.8,
"completions/mean_length": 221.78984375,
"completions/mean_terminated_length": 221.78984375,
"completions/min_length": 77.8,
"completions/min_terminated_length": 77.8,
"epoch": 0.9784045835169678,
"grad_norm": 0.31854687165087076,
"kl": 0.0183258056640625,
"learning_rate": 1e-06,
"loss": -0.0058,
"num_tokens": 100996640.0,
"reward": 0.8102917551994324,
"reward_std": 0.07570969834923744,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.7768138289451599,
"rewards/qatch_metrics/std": 0.34436498284339906,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 555
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 490.6,
"completions/max_terminated_length": 490.6,
"completions/mean_length": 230.28828125,
"completions/mean_terminated_length": 230.28828125,
"completions/min_length": 85.0,
"completions/min_terminated_length": 85.0,
"epoch": 0.9872190392243279,
"grad_norm": 0.16545798735816303,
"kl": 0.01719970703125,
"learning_rate": 1e-06,
"loss": 0.0054,
"num_tokens": 101777473.0,
"reward": 0.854366683959961,
"reward_std": 0.050544672086834906,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.8286666750907898,
"rewards/qatch_metrics/std": 0.3027670204639435,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 560
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 460.0,
"completions/max_terminated_length": 460.0,
"completions/mean_length": 232.278125,
"completions/mean_terminated_length": 232.278125,
"completions/min_length": 79.2,
"completions/min_terminated_length": 79.2,
"epoch": 0.996033494931688,
"grad_norm": 0.2064718967348405,
"kl": 0.020306396484375,
"learning_rate": 1e-06,
"loss": -0.0052,
"num_tokens": 102547669.0,
"reward": 0.7918175339698792,
"reward_std": 0.05684706475585699,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.755079448223114,
"rewards/qatch_metrics/std": 0.3250477254390717,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 565
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 468.5,
"completions/max_terminated_length": 468.5,
"completions/mean_length": 214.265625,
"completions/mean_terminated_length": 214.265625,
"completions/min_length": 66.0,
"completions/min_terminated_length": 66.0,
"epoch": 0.999559277214632,
"kl": 0.01806640625,
"num_tokens": 102823629.0,
"reward": 0.8797399699687958,
"reward_std": 0.056224397383630276,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.858517587184906,
"rewards/qatch_metrics/std": 0.26497258245944977,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"step": 567,
"total_flos": 0.0,
"train_loss": -1.6490349831877564e-05,
"train_runtime": 5804.9117,
"train_samples_per_second": 1.564,
"train_steps_per_second": 0.098
}
],
"logging_steps": 5,
"max_steps": 567,
"num_input_tokens_seen": 102823629,
"num_train_epochs": 1,
"save_steps": 5,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}