{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.9893390191897655,
  "eval_steps": 100,
  "global_step": 232,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio": 0.0,
      "completion_length": 445.9810485839844,
      "epoch": 0.017057569296375266,
      "grad_norm": 0.4470120966434479,
      "kl": 0.0,
      "learning_rate": 1.25e-07,
      "loss": -0.0061,
      "reward": 0.2678571566939354,
      "reward_std": 0.3369061201810837,
      "rewards/accuracy_reward": 0.2154017947614193,
      "rewards/format_reward": 0.05245535937137902,
      "step": 1
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 457.7648096084595,
      "epoch": 0.08528784648187633,
      "grad_norm": 0.46972450613975525,
      "kl": 4.832446575164795e-05,
      "learning_rate": 6.25e-07,
      "loss": -0.0007,
      "reward": 0.2547433148138225,
      "reward_std": 0.3450065036304295,
      "rewards/accuracy_reward": 0.19698661542497575,
      "rewards/format_reward": 0.05775669927243143,
      "step": 5
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 449.00381622314455,
      "epoch": 0.17057569296375266,
      "grad_norm": 0.4081119894981384,
      "kl": 0.0028772354125976562,
      "learning_rate": 1.25e-06,
      "loss": 0.0138,
      "reward": 0.3084821570664644,
      "reward_std": 0.38891434073448183,
      "rewards/accuracy_reward": 0.2015625100582838,
      "rewards/format_reward": 0.10691964821889996,
      "step": 10
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 445.6658676147461,
      "epoch": 0.255863539445629,
      "grad_norm": 0.8681241273880005,
      "kl": 0.05352325439453125,
      "learning_rate": 1.875e-06,
      "loss": 0.0257,
      "reward": 0.6531250305473805,
      "reward_std": 0.5433857575058937,
      "rewards/accuracy_reward": 0.25892858393490314,
      "rewards/format_reward": 0.3941964443773031,
      "step": 15
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 469.591983795166,
      "epoch": 0.3411513859275053,
      "grad_norm": 0.5079839825630188,
      "kl": 0.0329986572265625,
      "learning_rate": 2.5e-06,
      "loss": 0.0307,
      "reward": 1.051785758137703,
      "reward_std": 0.5734383672475815,
      "rewards/accuracy_reward": 0.4296875223517418,
      "rewards/format_reward": 0.6220982387661934,
      "step": 20
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 494.53462371826174,
      "epoch": 0.42643923240938164,
      "grad_norm": 0.7165977954864502,
      "kl": 0.0740936279296875,
      "learning_rate": 2.999828909426247e-06,
      "loss": 0.046,
      "reward": 1.3908482611179351,
      "reward_std": 0.45604070723056794,
      "rewards/accuracy_reward": 0.5560268141329289,
      "rewards/format_reward": 0.8348214745521545,
      "step": 25
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 466.7975654602051,
      "epoch": 0.511727078891258,
      "grad_norm": 105.83580780029297,
      "kl": 0.15655517578125,
      "learning_rate": 2.9938448364256362e-06,
      "loss": 0.0562,
      "reward": 1.4703125596046447,
      "reward_std": 0.39982456117868426,
      "rewards/accuracy_reward": 0.5502232410013675,
      "rewards/format_reward": 0.9200893312692642,
      "step": 30
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 506.7870819091797,
      "epoch": 0.5970149253731343,
      "grad_norm": 185.41656494140625,
      "kl": 0.2352294921875,
      "learning_rate": 2.979345224048116e-06,
      "loss": 0.1242,
      "reward": 1.4352679133415223,
      "reward_std": 0.4489331416785717,
      "rewards/accuracy_reward": 0.5587053798139096,
      "rewards/format_reward": 0.8765625402331352,
      "step": 35
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 534.5277046203613,
      "epoch": 0.6823027718550106,
      "grad_norm": 174158.28125,
      "kl": 1967.46162109375,
      "learning_rate": 2.956412726139078e-06,
      "loss": 65.1453,
      "reward": 1.3604911327362061,
      "reward_std": 0.5219059348106384,
      "rewards/accuracy_reward": 0.557812524586916,
      "rewards/format_reward": 0.8026786133646965,
      "step": 40
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 597.9991287231445,
      "epoch": 0.767590618336887,
      "grad_norm": 19395.80078125,
      "kl": 781.421875,
      "learning_rate": 2.925178067512904e-06,
      "loss": 24.4654,
      "reward": 1.210491119325161,
      "reward_std": 0.614486200362444,
      "rewards/accuracy_reward": 0.5167410962283612,
      "rewards/format_reward": 0.6937500357627868,
      "step": 45
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 663.8980178833008,
      "epoch": 0.8528784648187633,
      "grad_norm": 12.801368713378906,
      "kl": 8.7136962890625,
      "learning_rate": 2.88581929876693e-06,
      "loss": 0.4607,
      "reward": 0.9968750447034835,
      "reward_std": 0.6696360170841217,
      "rewards/accuracy_reward": 0.4488839447498322,
      "rewards/format_reward": 0.547991094738245,
      "step": 50
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 697.5154357910156,
      "epoch": 0.9381663113006397,
      "grad_norm": 38.23894119262695,
      "kl": 0.28189697265625,
      "learning_rate": 2.8385607813186967e-06,
      "loss": 0.2599,
      "reward": 0.9542411118745804,
      "reward_std": 0.6912495926022529,
      "rewards/accuracy_reward": 0.4694196626543999,
      "rewards/format_reward": 0.4848214492201805,
      "step": 55
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 649.4566436767578,
      "epoch": 1.0341151385927505,
      "grad_norm": 313.91156005859375,
      "kl": 4.05673828125,
      "learning_rate": 2.7836719084521715e-06,
      "loss": 0.5276,
      "reward": 1.1421875596046447,
      "reward_std": 0.7461830854415894,
      "rewards/accuracy_reward": 0.529910734295845,
      "rewards/format_reward": 0.6122768133878708,
      "step": 60
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 657.656948852539,
      "epoch": 1.1194029850746268,
      "grad_norm": 83.17515563964844,
      "kl": 4.4130859375,
      "learning_rate": 2.7214655696635407e-06,
      "loss": 0.5723,
      "reward": 1.0674107655882836,
      "reward_std": 0.7840202301740646,
      "rewards/accuracy_reward": 0.4863839507102966,
      "rewards/format_reward": 0.5810268193483352,
      "step": 65
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 586.2384178161622,
      "epoch": 1.2046908315565032,
      "grad_norm": 15.149043083190918,
      "kl": 2.45390625,
      "learning_rate": 2.652296367060421e-06,
      "loss": 0.4104,
      "reward": 1.256919699907303,
      "reward_std": 0.7236043408513069,
      "rewards/accuracy_reward": 0.5531250238418579,
      "rewards/format_reward": 0.703794677555561,
      "step": 70
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 574.1111854553222,
      "epoch": 1.2899786780383795,
      "grad_norm": 21.18400764465332,
      "kl": 9.9931640625,
      "learning_rate": 2.5765585939817676e-06,
      "loss": 0.744,
      "reward": 1.3468750655651092,
      "reward_std": 0.6585553318262101,
      "rewards/accuracy_reward": 0.5841518118977547,
      "rewards/format_reward": 0.7627232521772385,
      "step": 75
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 564.7518135070801,
      "epoch": 1.375266524520256,
      "grad_norm": 486.4627990722656,
      "kl": 8.6583984375,
      "learning_rate": 2.4946839873611927e-06,
      "loss": 0.531,
      "reward": 1.4149554252624512,
      "reward_std": 0.6240757808089257,
      "rewards/accuracy_reward": 0.6238839633762836,
      "rewards/format_reward": 0.7910714626312256,
      "step": 80
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 565.4381927490234,
      "epoch": 1.4605543710021323,
      "grad_norm": 88.8906021118164,
      "kl": 2.0404296875,
      "learning_rate": 2.4071392666461563e-06,
      "loss": 0.1685,
      "reward": 1.3975447028875352,
      "reward_std": 0.6175315536558628,
      "rewards/accuracy_reward": 0.6136160999536514,
      "rewards/format_reward": 0.7839286029338837,
      "step": 85
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 569.019669342041,
      "epoch": 1.5458422174840085,
      "grad_norm": 17.572298049926758,
      "kl": 1.3623046875,
      "learning_rate": 2.314423473302218e-06,
      "loss": 0.0949,
      "reward": 1.3495536297559738,
      "reward_std": 0.6313827067613602,
      "rewards/accuracy_reward": 0.5805803835391998,
      "rewards/format_reward": 0.7689732491970063,
      "step": 90
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 589.0908721923828,
      "epoch": 1.6311300639658848,
      "grad_norm": 219.19049072265625,
      "kl": 3.03681640625,
      "learning_rate": 2.2170651260682927e-06,
      "loss": 0.178,
      "reward": 1.3450893461704254,
      "reward_std": 0.6524915762245656,
      "rewards/accuracy_reward": 0.5834821671247482,
      "rewards/format_reward": 0.7616071730852128,
      "step": 95
    },
    {
      "epoch": 1.716417910447761,
      "grad_norm": 9490.1796875,
      "learning_rate": 2.1156192081791355e-06,
      "loss": 5.6995,
      "step": 100
    },
    {
      "epoch": 1.716417910447761,
      "eval_clip_ratio": 0.0,
      "eval_completion_length": 583.028025374245,
      "eval_kl": 8.038014177316294,
      "eval_loss": 0.30782344937324524,
      "eval_reward": 1.30117531782522,
      "eval_reward_std": 0.6690196339695599,
      "eval_rewards/accuracy_reward": 0.538081950558641,
      "eval_rewards/format_reward": 0.7630933717416879,
      "eval_runtime": 4660.7236,
      "eval_samples_per_second": 1.073,
      "eval_steps_per_second": 0.01,
      "step": 100
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 589.2927730560302,
      "epoch": 1.8017057569296375,
      "grad_norm": 36.098472595214844,
      "kl": 46.948193359375,
      "learning_rate": 2.010664003729149e-06,
      "loss": 0.191,
      "reward": 1.341964341700077,
      "reward_std": 0.666185948625207,
      "rewards/accuracy_reward": 0.579464315995574,
      "rewards/format_reward": 0.7625000335276126,
      "step": 105
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 572.2181030273438,
      "epoch": 1.886993603411514,
      "grad_norm": 33.85002517700195,
      "kl": 1.8282470703125,
      "learning_rate": 1.9027978012115653e-06,
      "loss": 0.1948,
      "reward": 1.3232143372297287,
      "reward_std": 0.6623021572828293,
      "rewards/accuracy_reward": 0.5607143074274064,
      "rewards/format_reward": 0.7625000298023223,
      "step": 110
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 542.2167655944825,
      "epoch": 1.9722814498933903,
      "grad_norm": 225.037109375,
      "kl": 0.3978759765625,
      "learning_rate": 1.7926354830241926e-06,
      "loss": 0.0266,
      "reward": 1.3252232730388642,
      "reward_std": 0.6835980974137783,
      "rewards/accuracy_reward": 0.5741071701049805,
      "rewards/format_reward": 0.7511161074042321,
      "step": 115
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 525.6891525268554,
      "epoch": 2.068230277185501,
      "grad_norm": 7.967476844787598,
      "kl": 0.974072265625,
      "learning_rate": 1.6808050203829845e-06,
      "loss": 0.0252,
      "reward": 1.304241132736206,
      "reward_std": 0.6886323638260364,
      "rewards/accuracy_reward": 0.5553571663796901,
      "rewards/format_reward": 0.7488839626312256,
      "step": 120
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 490.7185485839844,
      "epoch": 2.1535181236673773,
      "grad_norm": 2670.3564453125,
      "kl": 3.83818359375,
      "learning_rate": 1.5679438936238768e-06,
      "loss": 0.0711,
      "reward": 1.361607202887535,
      "reward_std": 0.621609278768301,
      "rewards/accuracy_reward": 0.5705357417464256,
      "rewards/format_reward": 0.7910714641213417,
      "step": 125
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 489.616316986084,
      "epoch": 2.2388059701492535,
      "grad_norm": 92.28889465332031,
      "kl": 2.853662109375,
      "learning_rate": 1.454695458298667e-06,
      "loss": 0.0984,
      "reward": 1.4082589894533157,
      "reward_std": 0.5909771449863911,
      "rewards/accuracy_reward": 0.5848214507102967,
      "rewards/format_reward": 0.8234375342726707,
      "step": 130
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 480.3973449707031,
      "epoch": 2.3240938166311302,
      "grad_norm": 636.6416015625,
      "kl": 1.462548828125,
      "learning_rate": 1.341705277779715e-06,
      "loss": 0.0403,
      "reward": 1.3687500536441803,
      "reward_std": 0.5565730266273021,
      "rewards/accuracy_reward": 0.5379464507102967,
      "rewards/format_reward": 0.8308036103844643,
      "step": 135
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 473.5602897644043,
      "epoch": 2.4093816631130065,
      "grad_norm": 175.14183044433594,
      "kl": 1.86015625,
      "learning_rate": 1.2296174432791415e-06,
      "loss": 0.0452,
      "reward": 1.3828125596046448,
      "reward_std": 0.5798827618360519,
      "rewards/accuracy_reward": 0.5468750238418579,
      "rewards/format_reward": 0.8359375432133674,
      "step": 140
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 460.103589630127,
      "epoch": 2.4946695095948828,
      "grad_norm": 2649.130859375,
      "kl": 2.87041015625,
      "learning_rate": 1.1190709022599545e-06,
      "loss": 0.0699,
      "reward": 1.3917411416769028,
      "reward_std": 0.5849474132061004,
      "rewards/accuracy_reward": 0.556919663399458,
      "rewards/format_reward": 0.8348214626312256,
      "step": 145
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 458.2018081665039,
      "epoch": 2.579957356076759,
      "grad_norm": 1305.534423828125,
      "kl": 4.1134765625,
      "learning_rate": 1.0106958161686963e-06,
      "loss": 0.1285,
      "reward": 1.340178629755974,
      "reward_std": 0.6000380210578442,
      "rewards/accuracy_reward": 0.5174107395112515,
      "rewards/format_reward": 0.8227678969502449,
      "step": 150
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 452.27725372314455,
      "epoch": 2.6652452025586353,
      "grad_norm": 481.3914794921875,
      "kl": 2.22734375,
      "learning_rate": 9.051099682520474e-07,
      "loss": 0.039,
      "reward": 1.3504464954137803,
      "reward_std": 0.613651292026043,
      "rewards/accuracy_reward": 0.5243303805589676,
      "rewards/format_reward": 0.826116107404232,
      "step": 155
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 452.8314926147461,
      "epoch": 2.750533049040512,
      "grad_norm": 59.583213806152344,
      "kl": 2.50859375,
      "learning_rate": 8.029152419343472e-07,
      "loss": 0.058,
      "reward": 1.3649554133415223,
      "reward_std": 0.6238026581704617,
      "rewards/accuracy_reward": 0.544419664889574,
      "rewards/format_reward": 0.8205357521772385,
      "step": 160
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 455.3404197692871,
      "epoch": 2.835820895522388,
      "grad_norm": 1071.0487060546875,
      "kl": 2.58671875,
      "learning_rate": 7.046941898307347e-07,
      "loss": 0.076,
      "reward": 1.340848270058632,
      "reward_std": 0.6379699215292931,
      "rewards/accuracy_reward": 0.5301339507102967,
      "rewards/format_reward": 0.8107143223285675,
      "step": 165
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 461.10560073852537,
      "epoch": 2.9211087420042645,
      "grad_norm": 869.7614135742188,
      "kl": 2.18603515625,
      "learning_rate": 6.11006712953975e-07,
      "loss": 0.06,
      "reward": 1.3625000655651092,
      "reward_std": 0.605516166985035,
      "rewards/accuracy_reward": 0.5419643118977546,
      "rewards/format_reward": 0.8205357491970062,
      "step": 170
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 465.46427154541016,
      "epoch": 3.0170575692963753,
      "grad_norm": 37.821475982666016,
      "kl": 3.72734375,
      "learning_rate": 5.223868690448817e-07,
      "loss": 0.0881,
      "reward": 1.3287946999073028,
      "reward_std": 0.6144991792738438,
      "rewards/accuracy_reward": 0.5147321581840515,
      "rewards/format_reward": 0.8140625327825546,
      "step": 175
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 473.2102905273438,
      "epoch": 3.1023454157782515,
      "grad_norm": 2058.541015625,
      "kl": 3.3796875,
      "learning_rate": 4.3933982822017883e-07,
      "loss": 0.108,
      "reward": 1.3754464894533158,
      "reward_std": 0.6123204372823239,
      "rewards/accuracy_reward": 0.5506696686148643,
      "rewards/format_reward": 0.8247768208384514,
      "step": 180
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 475.0154228210449,
      "epoch": 3.1876332622601278,
      "grad_norm": 267681.28125,
      "kl": 63.9951171875,
      "learning_rate": 3.6233899329188115e-07,
      "loss": 3.57,
      "reward": 1.3428572058677672,
      "reward_std": 0.6201678223907947,
      "rewards/accuracy_reward": 0.5258928827941418,
      "rewards/format_reward": 0.8169643193483352,
      "step": 185
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 471.1038162231445,
      "epoch": 3.272921108742004,
      "grad_norm": 2452.4267578125,
      "kl": 32.8134765625,
      "learning_rate": 2.9182330117358096e-07,
      "loss": 1.8761,
      "reward": 1.3464286357164383,
      "reward_std": 0.6306888595223427,
      "rewards/accuracy_reward": 0.5354910954833031,
      "rewards/format_reward": 0.810937537252903,
      "step": 190
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 461.9361801147461,
      "epoch": 3.3582089552238807,
      "grad_norm": 938.4805908203125,
      "kl": 4.35234375,
      "learning_rate": 2.281947207567473e-07,
      "loss": 0.131,
      "reward": 1.3305804163217545,
      "reward_std": 0.6262686759233475,
      "rewards/accuracy_reward": 0.5127232395112514,
      "rewards/format_reward": 0.8178571835160255,
      "step": 195
    },
    {
      "epoch": 3.443496801705757,
      "grad_norm": 1083.9442138671875,
      "learning_rate": 1.718159615201853e-07,
      "loss": 0.0121,
      "step": 200
    },
    {
      "epoch": 3.443496801705757,
      "eval_clip_ratio": 0.0,
      "eval_completion_length": 467.76792581089006,
      "eval_kl": 2.3580770766773163,
      "eval_loss": 0.06606976687908173,
      "eval_reward": 1.3115872357980893,
      "eval_reward_std": 0.6067914157248915,
      "eval_rewards/accuracy_reward": 0.492383637367346,
      "eval_rewards/format_reward": 0.8192035969073018,
      "eval_runtime": 4249.3378,
      "eval_samples_per_second": 1.177,
      "eval_steps_per_second": 0.011,
      "step": 200
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 464.8625213623047,
      "epoch": 3.5287846481876333,
      "grad_norm": 41.21245193481445,
      "kl": 2.200341796875,
      "learning_rate": 1.2300840593454622e-07,
      "loss": 0.076,
      "reward": 1.3582589879631997,
      "reward_std": 0.6224493138492108,
      "rewards/accuracy_reward": 0.5436384178698063,
      "rewards/format_reward": 0.8146205730736256,
      "step": 205
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 465.8446632385254,
      "epoch": 3.6140724946695095,
      "grad_norm": 418.15625,
      "kl": 2.2728515625,
      "learning_rate": 8.20502774480395e-08,
      "loss": 0.0404,
      "reward": 1.377901840209961,
      "reward_std": 0.6090469680726528,
      "rewards/accuracy_reward": 0.5529018096625805,
      "rewards/format_reward": 0.8250000357627869,
      "step": 210
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 464.74443969726565,
      "epoch": 3.699360341151386,
      "grad_norm": 315.2561340332031,
      "kl": 2.88203125,
      "learning_rate": 4.917505449659615e-08,
      "loss": 0.0884,
      "reward": 1.368080422282219,
      "reward_std": 0.6142228864133358,
      "rewards/accuracy_reward": 0.5479910932481289,
      "rewards/format_reward": 0.8200893253087997,
      "step": 215
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 467.38953018188477,
      "epoch": 3.7846481876332625,
      "grad_norm": 156.9326171875,
      "kl": 2.8296875,
      "learning_rate": 2.4570139579284723e-08,
      "loss": 0.0658,
      "reward": 1.3386161386966706,
      "reward_std": 0.6271648786962032,
      "rewards/accuracy_reward": 0.5265625193715096,
      "rewards/format_reward": 0.8120536088943482,
      "step": 220
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 462.0622955322266,
      "epoch": 3.8699360341151388,
      "grad_norm": 81.24732208251953,
      "kl": 2.6775390625,
      "learning_rate": 8.37579098581176e-09,
      "loss": 0.0865,
      "reward": 1.3390625447034836,
      "reward_std": 0.6189317315816879,
      "rewards/accuracy_reward": 0.5323660977184772,
      "rewards/format_reward": 0.8066964671015739,
      "step": 225
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 470.36943969726565,
      "epoch": 3.955223880597015,
      "grad_norm": 553.80859375,
      "kl": 2.40078125,
      "learning_rate": 6.843232656998933e-10,
      "loss": 0.053,
      "reward": 1.3424107700586319,
      "reward_std": 0.6173162661492825,
      "rewards/accuracy_reward": 0.5263393089175225,
      "rewards/format_reward": 0.8160714641213417,
      "step": 230
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 486.319974899292,
      "epoch": 3.9893390191897655,
      "kl": 2.2841796875,
      "reward": 1.3577009439468384,
      "reward_std": 0.5995111986994743,
      "rewards/accuracy_reward": 0.5401785913854837,
      "rewards/format_reward": 0.8175223655998707,
      "step": 232,
      "total_flos": 0.0,
      "train_loss": 2.3046215271270145,
      "train_runtime": 39674.2598,
      "train_samples_per_second": 0.756,
      "train_steps_per_second": 0.006
    }
  ],
  "logging_steps": 5,
  "max_steps": 232,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}