|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 5.984, |
|
"eval_steps": 100, |
|
"global_step": 246, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 583.2621688842773, |
|
"epoch": 0.12, |
|
"grad_norm": 0.12920165061950684, |
|
"kl": 0.0007047017415364583, |
|
"learning_rate": 4e-07, |
|
"loss": 0.0039, |
|
"reward": 0.07673611293236414, |
|
"reward_std": 0.12897611850251753, |
|
"rewards/accuracy_reward": 0.07100694642091791, |
|
"rewards/format_reward": 0.0057291668374091385, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 576.849148050944, |
|
"epoch": 0.24, |
|
"grad_norm": 0.14590902626514435, |
|
"kl": 0.0013418197631835938, |
|
"learning_rate": 8e-07, |
|
"loss": 0.0005, |
|
"reward": 0.0871527800646921, |
|
"reward_std": 0.13713855588187773, |
|
"rewards/accuracy_reward": 0.081250002173086, |
|
"rewards/format_reward": 0.005902777938172221, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 595.0527969360352, |
|
"epoch": 0.36, |
|
"grad_norm": 0.13242916762828827, |
|
"kl": 0.0022638956705729168, |
|
"learning_rate": 1.2e-06, |
|
"loss": 0.0035, |
|
"reward": 0.08645833590999245, |
|
"reward_std": 0.12982427552342415, |
|
"rewards/accuracy_reward": 0.07795139101023475, |
|
"rewards/format_reward": 0.008506944651405017, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 589.0404724121094, |
|
"epoch": 0.48, |
|
"grad_norm": 0.19261103868484497, |
|
"kl": 0.006444295247395833, |
|
"learning_rate": 1.6e-06, |
|
"loss": 0.0083, |
|
"reward": 0.12013889234513045, |
|
"reward_std": 0.17057897535463173, |
|
"rewards/accuracy_reward": 0.10694444783342381, |
|
"rewards/format_reward": 0.013194444729015231, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 600.2750193277994, |
|
"epoch": 0.6, |
|
"grad_norm": 0.18090881407260895, |
|
"kl": 0.00933685302734375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.011, |
|
"reward": 0.12621528124436737, |
|
"reward_std": 0.1751370935390393, |
|
"rewards/accuracy_reward": 0.11458333623595536, |
|
"rewards/format_reward": 0.011631944697971146, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 577.3231079101563, |
|
"epoch": 0.72, |
|
"grad_norm": 0.14372943341732025, |
|
"kl": 0.009203084309895833, |
|
"learning_rate": 1.9974751105436262e-06, |
|
"loss": 0.0063, |
|
"reward": 0.13611111496575176, |
|
"reward_std": 0.17103372573231657, |
|
"rewards/accuracy_reward": 0.1302083367947489, |
|
"rewards/format_reward": 0.005902777938172221, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 564.1434224446615, |
|
"epoch": 0.84, |
|
"grad_norm": 0.15495018661022186, |
|
"kl": 0.009105428059895834, |
|
"learning_rate": 1.98991319230804e-06, |
|
"loss": 0.0107, |
|
"reward": 0.1550347256163756, |
|
"reward_std": 0.19651179468880098, |
|
"rewards/accuracy_reward": 0.14583333718279998, |
|
"rewards/format_reward": 0.009201389101023475, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 600.4140803019205, |
|
"epoch": 0.96, |
|
"grad_norm": 0.17615483701229095, |
|
"kl": 0.012515767415364584, |
|
"learning_rate": 1.9773524313084854e-06, |
|
"loss": 0.0119, |
|
"reward": 0.15885417150954406, |
|
"reward_std": 0.2019161203255256, |
|
"rewards/accuracy_reward": 0.14166667039195696, |
|
"rewards/format_reward": 0.017187500388051072, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 631.5385569254557, |
|
"epoch": 1.096, |
|
"grad_norm": 0.16588236391544342, |
|
"kl": 0.014839680989583333, |
|
"learning_rate": 1.959856256610988e-06, |
|
"loss": 0.0096, |
|
"reward": 0.18350694837669532, |
|
"reward_std": 0.2348036120335261, |
|
"rewards/accuracy_reward": 0.15277778275000553, |
|
"rewards/format_reward": 0.030729167473812897, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 593.8875162760417, |
|
"epoch": 1.216, |
|
"grad_norm": 0.18021586537361145, |
|
"kl": 0.016481526692708335, |
|
"learning_rate": 1.9375130200295876e-06, |
|
"loss": 0.0189, |
|
"reward": 0.19756944961845874, |
|
"reward_std": 0.24355731457471846, |
|
"rewards/accuracy_reward": 0.16006944881131252, |
|
"rewards/format_reward": 0.03750000107102096, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 587.4776229858398, |
|
"epoch": 1.336, |
|
"grad_norm": 0.1994011402130127, |
|
"kl": 0.0164459228515625, |
|
"learning_rate": 1.9104355499692162e-06, |
|
"loss": 0.0176, |
|
"reward": 0.2189236176510652, |
|
"reward_std": 0.28772813665370145, |
|
"rewards/accuracy_reward": 0.14583333767950535, |
|
"rewards/format_reward": 0.07309028026647865, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 575.6356079101563, |
|
"epoch": 1.456, |
|
"grad_norm": 0.23497304320335388, |
|
"kl": 0.02615966796875, |
|
"learning_rate": 1.8787605816671951e-06, |
|
"loss": 0.0222, |
|
"reward": 0.33281251018246016, |
|
"reward_std": 0.3787623425324758, |
|
"rewards/accuracy_reward": 0.13923611460874477, |
|
"rewards/format_reward": 0.19357639501492183, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 581.5133870442709, |
|
"epoch": 1.576, |
|
"grad_norm": 0.2583252191543579, |
|
"kl": 0.039479573567708336, |
|
"learning_rate": 1.8426480667105175e-06, |
|
"loss": 0.0407, |
|
"reward": 0.4626736263434092, |
|
"reward_std": 0.45475957343975704, |
|
"rewards/accuracy_reward": 0.13229167129223546, |
|
"rewards/format_reward": 0.33038195346792537, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 575.3401219685873, |
|
"epoch": 1.696, |
|
"grad_norm": 0.2477940022945404, |
|
"kl": 0.04772135416666667, |
|
"learning_rate": 1.8022803653156982e-06, |
|
"loss": 0.037, |
|
"reward": 0.6152777954936027, |
|
"reward_std": 0.4820722574989001, |
|
"rewards/accuracy_reward": 0.11440972487131755, |
|
"rewards/format_reward": 0.5008680661519368, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 592.2732818603515, |
|
"epoch": 1.8159999999999998, |
|
"grad_norm": 0.240928515791893, |
|
"kl": 0.045609537760416666, |
|
"learning_rate": 1.7578613254499968e-06, |
|
"loss": 0.0367, |
|
"reward": 0.6859375188748041, |
|
"reward_std": 0.48161858022212983, |
|
"rewards/accuracy_reward": 0.1130208361428231, |
|
"rewards/format_reward": 0.5729166840513548, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 570.3640808105469, |
|
"epoch": 1.936, |
|
"grad_norm": 0.30649441480636597, |
|
"kl": 0.05423177083333333, |
|
"learning_rate": 1.7096152534442513e-06, |
|
"loss": 0.0334, |
|
"reward": 0.7704861332972844, |
|
"reward_std": 0.4483942608038584, |
|
"rewards/accuracy_reward": 0.10572916980211934, |
|
"rewards/format_reward": 0.6647569671273231, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 560.8760559082032, |
|
"epoch": 2.072, |
|
"grad_norm": 0.25217875838279724, |
|
"kl": 0.054768880208333336, |
|
"learning_rate": 1.6577857812954991e-06, |
|
"loss": 0.0375, |
|
"reward": 0.8114583571751912, |
|
"reward_std": 0.4467007691661517, |
|
"rewards/accuracy_reward": 0.12256944736776253, |
|
"rewards/format_reward": 0.6888889064391454, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 565.7517557779948, |
|
"epoch": 2.192, |
|
"grad_norm": 0.27332058548927307, |
|
"kl": 0.0546630859375, |
|
"learning_rate": 1.6026346363792564e-06, |
|
"loss": 0.0329, |
|
"reward": 0.8633680770794551, |
|
"reward_std": 0.47069497853517533, |
|
"rewards/accuracy_reward": 0.15156250428408385, |
|
"rewards/format_reward": 0.711805577079455, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 564.6762344360352, |
|
"epoch": 2.312, |
|
"grad_norm": 0.22232523560523987, |
|
"kl": 0.05868123372395833, |
|
"learning_rate": 1.5444403197841344e-06, |
|
"loss": 0.0211, |
|
"reward": 0.8944444666306178, |
|
"reward_std": 0.42260901977618537, |
|
"rewards/accuracy_reward": 0.1461805594774584, |
|
"rewards/format_reward": 0.7482639104127884, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 2.432, |
|
"grad_norm": 0.21494214236736298, |
|
"learning_rate": 1.4834966999429178e-06, |
|
"loss": 0.0271, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.432, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completion_length": 528.4134756234976, |
|
"eval_kl": 0.060819185697115384, |
|
"eval_loss": 0.013302656821906567, |
|
"eval_reward": 0.97275644999284, |
|
"eval_reward_std": 0.405555764069924, |
|
"eval_rewards/accuracy_reward": 0.16025641531898424, |
|
"eval_rewards/format_reward": 0.8125000275098361, |
|
"eval_runtime": 164.4929, |
|
"eval_samples_per_second": 0.602, |
|
"eval_steps_per_second": 0.018, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 562.4533172607422, |
|
"epoch": 2.552, |
|
"grad_norm": 0.24463680386543274, |
|
"kl": 0.06415608723958334, |
|
"learning_rate": 1.4201115286619464e-06, |
|
"loss": 0.0246, |
|
"reward": 0.9512153029441833, |
|
"reward_std": 0.4153951602677504, |
|
"rewards/accuracy_reward": 0.16093750478078922, |
|
"rewards/format_reward": 0.7902777964870135, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 558.5052256266276, |
|
"epoch": 2.672, |
|
"grad_norm": 0.22716720402240753, |
|
"kl": 0.066162109375, |
|
"learning_rate": 1.3546048870425354e-06, |
|
"loss": 0.0242, |
|
"reward": 0.9326389094193777, |
|
"reward_std": 0.4124096731344859, |
|
"rewards/accuracy_reward": 0.15625000453243654, |
|
"rewards/format_reward": 0.7763889074325562, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 547.4661656697591, |
|
"epoch": 2.792, |
|
"grad_norm": 0.2325204312801361, |
|
"kl": 0.07105712890625, |
|
"learning_rate": 1.2873075691421806e-06, |
|
"loss": 0.0198, |
|
"reward": 0.9222222457329432, |
|
"reward_std": 0.4309779698650042, |
|
"rewards/accuracy_reward": 0.16579861616094907, |
|
"rewards/format_reward": 0.7564236293236415, |
|
"step": 115 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 547.4175537109375, |
|
"epoch": 2.912, |
|
"grad_norm": 0.23773518204689026, |
|
"kl": 0.06711832682291667, |
|
"learning_rate": 1.218559411537699e-06, |
|
"loss": 0.0211, |
|
"reward": 0.894270858168602, |
|
"reward_std": 0.41917893588542937, |
|
"rewards/accuracy_reward": 0.15920139361793798, |
|
"rewards/format_reward": 0.7350694666306178, |
|
"step": 120 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 544.4187637329102, |
|
"epoch": 3.048, |
|
"grad_norm": 0.25982293486595154, |
|
"kl": 0.07174072265625, |
|
"learning_rate": 1.1487075772256517e-06, |
|
"loss": 0.029, |
|
"reward": 0.8729166895151138, |
|
"reward_std": 0.4223095287879308, |
|
"rewards/accuracy_reward": 0.16006944874922435, |
|
"rewards/format_reward": 0.7128472457329432, |
|
"step": 125 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 520.4824803670248, |
|
"epoch": 3.168, |
|
"grad_norm": 0.24558736383914948, |
|
"kl": 0.08404541015625, |
|
"learning_rate": 1.0781048025259646e-06, |
|
"loss": 0.0261, |
|
"reward": 0.8881944636503856, |
|
"reward_std": 0.42243550966183346, |
|
"rewards/accuracy_reward": 0.16284722611308097, |
|
"rewards/format_reward": 0.725347238779068, |
|
"step": 130 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 537.1248448689779, |
|
"epoch": 3.288, |
|
"grad_norm": 0.29106763005256653, |
|
"kl": 0.07824300130208334, |
|
"learning_rate": 1.0071076158414974e-06, |
|
"loss": 0.0249, |
|
"reward": 0.9265625198682149, |
|
"reward_std": 0.41332067002852757, |
|
"rewards/accuracy_reward": 0.17378472685813903, |
|
"rewards/format_reward": 0.7527777969837188, |
|
"step": 135 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 550.350016784668, |
|
"epoch": 3.408, |
|
"grad_norm": 0.2443486452102661, |
|
"kl": 0.071923828125, |
|
"learning_rate": 9.360745372684345e-07, |
|
"loss": 0.0295, |
|
"reward": 0.8960069666306177, |
|
"reward_std": 0.4272393837571144, |
|
"rewards/accuracy_reward": 0.17343750571211178, |
|
"rewards/format_reward": 0.7225694636503855, |
|
"step": 140 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 543.1215454101563, |
|
"epoch": 3.528, |
|
"grad_norm": 0.24748581647872925, |
|
"kl": 0.07239176432291666, |
|
"learning_rate": 8.653642681490607e-07, |
|
"loss": 0.0247, |
|
"reward": 0.9074653029441834, |
|
"reward_std": 0.4173097605506579, |
|
"rewards/accuracy_reward": 0.1642361162851254, |
|
"rewards/format_reward": 0.7432291815678279, |
|
"step": 145 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 539.6696355183919, |
|
"epoch": 3.648, |
|
"grad_norm": 0.2420652210712433, |
|
"kl": 0.0806884765625, |
|
"learning_rate": 7.953338797092901e-07, |
|
"loss": 0.0247, |
|
"reward": 0.9553819715976715, |
|
"reward_std": 0.39353689054648083, |
|
"rewards/accuracy_reward": 0.15902778361923992, |
|
"rewards/format_reward": 0.7963541895151138, |
|
"step": 150 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 542.0953282674153, |
|
"epoch": 3.768, |
|
"grad_norm": 0.22927226126194, |
|
"kl": 0.08297119140625, |
|
"learning_rate": 7.263370099279171e-07, |
|
"loss": 0.0375, |
|
"reward": 0.9421875288089117, |
|
"reward_std": 0.3737917934854825, |
|
"rewards/accuracy_reward": 0.14618056000520785, |
|
"rewards/format_reward": 0.7960069606701533, |
|
"step": 155 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 560.2560948689778, |
|
"epoch": 3.888, |
|
"grad_norm": 0.2442265897989273, |
|
"kl": 0.08262532552083333, |
|
"learning_rate": 6.587220777430095e-07, |
|
"loss": 0.0438, |
|
"reward": 0.9187500218550364, |
|
"reward_std": 0.3985011622309685, |
|
"rewards/accuracy_reward": 0.1730902827034394, |
|
"rewards/format_reward": 0.7456597417593003, |
|
"step": 160 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 549.7503621419271, |
|
"epoch": 4.024, |
|
"grad_norm": 0.36080020666122437, |
|
"kl": 0.09295247395833334, |
|
"learning_rate": 5.928305236133016e-07, |
|
"loss": 0.0348, |
|
"reward": 0.8918403009573619, |
|
"reward_std": 0.42846539815266926, |
|
"rewards/accuracy_reward": 0.16701389451821644, |
|
"rewards/format_reward": 0.7248264094193776, |
|
"step": 165 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 527.6090423583985, |
|
"epoch": 4.144, |
|
"grad_norm": 0.2888505756855011, |
|
"kl": 0.0927978515625, |
|
"learning_rate": 5.289950853193652e-07, |
|
"loss": 0.0569, |
|
"reward": 0.9263889143864313, |
|
"reward_std": 0.4129257212082545, |
|
"rewards/accuracy_reward": 0.17309028257926304, |
|
"rewards/format_reward": 0.75329862733682, |
|
"step": 170 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 521.2873443603515, |
|
"epoch": 4.264, |
|
"grad_norm": 0.2377632111310959, |
|
"kl": 0.09527180989583334, |
|
"learning_rate": 4.6753811771138365e-07, |
|
"loss": 0.0401, |
|
"reward": 0.9312500258286794, |
|
"reward_std": 0.3798152153690656, |
|
"rewards/accuracy_reward": 0.16128472660978635, |
|
"rewards/format_reward": 0.7699652989705403, |
|
"step": 175 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 545.0975886027019, |
|
"epoch": 4.384, |
|
"grad_norm": 0.2456796020269394, |
|
"kl": 0.093505859375, |
|
"learning_rate": 4.0876996488842475e-07, |
|
"loss": 0.0539, |
|
"reward": 0.9411458532015483, |
|
"reward_std": 0.40113388895988467, |
|
"rewards/accuracy_reward": 0.17621528282761573, |
|
"rewards/format_reward": 0.7649305760860443, |
|
"step": 180 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 552.6007125854492, |
|
"epoch": 4.504, |
|
"grad_norm": 0.24820686876773834, |
|
"kl": 0.09252522786458334, |
|
"learning_rate": 3.529873930293545e-07, |
|
"loss": 0.0572, |
|
"reward": 0.9163194666306178, |
|
"reward_std": 0.38814649879932406, |
|
"rewards/accuracy_reward": 0.15885417160267631, |
|
"rewards/format_reward": 0.7574652969837189, |
|
"step": 185 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 533.9510559082031, |
|
"epoch": 4.624, |
|
"grad_norm": 0.2605077922344208, |
|
"kl": 0.09794921875, |
|
"learning_rate": 3.0047209178924635e-07, |
|
"loss": 0.0484, |
|
"reward": 0.9178819686174393, |
|
"reward_std": 0.4066275705893834, |
|
"rewards/accuracy_reward": 0.16649306093653043, |
|
"rewards/format_reward": 0.7513889104127884, |
|
"step": 190 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 535.4948069254557, |
|
"epoch": 4.744, |
|
"grad_norm": 0.24027736485004425, |
|
"kl": 0.10040690104166666, |
|
"learning_rate": 2.514892518288988e-07, |
|
"loss": 0.0508, |
|
"reward": 0.9135416855414709, |
|
"reward_std": 0.3980190739035606, |
|
"rewards/accuracy_reward": 0.1682291696468989, |
|
"rewards/format_reward": 0.7453125208616257, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 4.864, |
|
"grad_norm": 0.24247297644615173, |
|
"learning_rate": 2.0628622566063058e-07, |
|
"loss": 0.0556, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 4.864, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completion_length": 497.66507427509015, |
|
"eval_kl": 0.10235126201923077, |
|
"eval_loss": 0.09290527552366257, |
|
"eval_reward": 0.8525641239606417, |
|
"eval_reward_std": 0.4311282199162703, |
|
"eval_rewards/accuracy_reward": 0.1314102616161108, |
|
"eval_rewards/format_reward": 0.7211538690787095, |
|
"eval_runtime": 164.8378, |
|
"eval_samples_per_second": 0.601, |
|
"eval_steps_per_second": 0.018, |
|
"step": 200 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 527.5610371907552, |
|
"epoch": 4.984, |
|
"grad_norm": 0.24340181052684784, |
|
"kl": 0.103265380859375, |
|
"learning_rate": 1.6509127857277782e-07, |
|
"loss": 0.0592, |
|
"reward": 0.9018229390184085, |
|
"reward_std": 0.38689753947158656, |
|
"rewards/accuracy_reward": 0.16232639336958526, |
|
"rewards/format_reward": 0.7394965469837189, |
|
"step": 205 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 535.6948079427083, |
|
"epoch": 5.12, |
|
"grad_norm": 0.2896415889263153, |
|
"kl": 0.10675455729166666, |
|
"learning_rate": 1.2811243594045694e-07, |
|
"loss": 0.059, |
|
"reward": 0.8942708512147267, |
|
"reward_std": 0.4014328221480052, |
|
"rewards/accuracy_reward": 0.15416667043852308, |
|
"rewards/format_reward": 0.7401041835546494, |
|
"step": 210 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 525.1632136027018, |
|
"epoch": 5.24, |
|
"grad_norm": 0.2585560977458954, |
|
"kl": 0.10707194010416667, |
|
"learning_rate": 9.55364327434105e-08, |
|
"loss": 0.0576, |
|
"reward": 0.8967014074325561, |
|
"reward_std": 0.3964859182635943, |
|
"rewards/accuracy_reward": 0.1552083384245634, |
|
"rewards/format_reward": 0.7414930721124013, |
|
"step": 215 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 518.6449793497721, |
|
"epoch": 5.36, |
|
"grad_norm": 0.2638151943683624, |
|
"kl": 0.110107421875, |
|
"learning_rate": 6.75277705956443e-08, |
|
"loss": 0.0621, |
|
"reward": 0.910069465637207, |
|
"reward_std": 0.38575134972731273, |
|
"rewards/accuracy_reward": 0.1703125045945247, |
|
"rewards/format_reward": 0.739756965637207, |
|
"step": 220 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 519.6583480834961, |
|
"epoch": 5.48, |
|
"grad_norm": 0.2671918570995331, |
|
"kl": 0.10970865885416667, |
|
"learning_rate": 4.422788704864633e-08, |
|
"loss": 0.0555, |
|
"reward": 0.9017361313104629, |
|
"reward_std": 0.39265564555923144, |
|
"rewards/accuracy_reward": 0.16128472667187452, |
|
"rewards/format_reward": 0.7404514094193776, |
|
"step": 225 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 518.6614771525066, |
|
"epoch": 5.6, |
|
"grad_norm": 0.2516481876373291, |
|
"kl": 0.10393880208333334, |
|
"learning_rate": 2.575444136302185e-08, |
|
"loss": 0.0585, |
|
"reward": 0.9203125298023224, |
|
"reward_std": 0.3854361062248548, |
|
"rewards/accuracy_reward": 0.16527778220673403, |
|
"rewards/format_reward": 0.7550347407658895, |
|
"step": 230 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 523.0328257242838, |
|
"epoch": 5.72, |
|
"grad_norm": 0.24842773377895355, |
|
"kl": 0.10460611979166666, |
|
"learning_rate": 1.220072035523989e-08, |
|
"loss": 0.0615, |
|
"reward": 0.9104166885217031, |
|
"reward_std": 0.4020949920018514, |
|
"rewards/accuracy_reward": 0.16076389284183581, |
|
"rewards/format_reward": 0.7496527930100759, |
|
"step": 235 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 548.0658167521159, |
|
"epoch": 5.84, |
|
"grad_norm": 0.2960110008716583, |
|
"kl": 0.10340983072916667, |
|
"learning_rate": 3.6351673198347087e-09, |
|
"loss": 0.0572, |
|
"reward": 0.894097242752711, |
|
"reward_std": 0.39487800349791846, |
|
"rewards/accuracy_reward": 0.15312500384946665, |
|
"rewards/format_reward": 0.740972242752711, |
|
"step": 240 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 524.3946365356445, |
|
"epoch": 5.96, |
|
"grad_norm": 0.28276899456977844, |
|
"kl": 0.1115234375, |
|
"learning_rate": 1.0103640590064522e-10, |
|
"loss": 0.0614, |
|
"reward": 0.8968750178813935, |
|
"reward_std": 0.40502374321222306, |
|
"rewards/accuracy_reward": 0.15798611640930177, |
|
"rewards/format_reward": 0.7388889054457347, |
|
"step": 245 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 484.8463541666667, |
|
"epoch": 5.984, |
|
"kl": 0.11258951822916667, |
|
"reward": 0.8923611293236414, |
|
"reward_std": 0.40955925981203717, |
|
"rewards/accuracy_reward": 0.1605902835726738, |
|
"rewards/format_reward": 0.731770858168602, |
|
"step": 246, |
|
"total_flos": 0.0, |
|
"train_loss": 0.03362821891328426, |
|
"train_runtime": 77437.8148, |
|
"train_samples_per_second": 0.62, |
|
"train_steps_per_second": 0.003 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 246, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 6, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 12, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|