diff --git "a/checkpoint-1500/trainer_state.json" "b/checkpoint-1500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1500/trainer_state.json" @@ -0,0 +1,2763 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.3776271965085563, + "eval_steps": 50, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00918800964741013, + "grad_norm": 0.036612071096897125, + "learning_rate": 4.999451708687114e-06, + "logits/chosen": 15.01579761505127, + "logits/rejected": 15.359031677246094, + "logps/chosen": -0.2681262791156769, + "logps/rejected": -0.31947994232177734, + "loss": 0.9551, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.40218934416770935, + "rewards/margins": 0.07703053951263428, + "rewards/rejected": -0.479219913482666, + "step": 10 + }, + { + "epoch": 0.01837601929482026, + "grad_norm": 0.05575725808739662, + "learning_rate": 4.997807075247147e-06, + "logits/chosen": 14.570712089538574, + "logits/rejected": 15.321355819702148, + "logps/chosen": -0.2867889404296875, + "logps/rejected": -0.3514837622642517, + "loss": 0.923, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.43018341064453125, + "rewards/margins": 0.09704220294952393, + "rewards/rejected": -0.5272256135940552, + "step": 20 + }, + { + "epoch": 0.02756402894223039, + "grad_norm": 0.0492466576397419, + "learning_rate": 4.9950668210706795e-06, + "logits/chosen": 14.748420715332031, + "logits/rejected": 14.969354629516602, + "logps/chosen": -0.28405922651290894, + "logps/rejected": -0.32855403423309326, + "loss": 0.9357, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.426088809967041, + "rewards/margins": 0.06674225628376007, + "rewards/rejected": -0.4928310811519623, + "step": 30 + }, + { + "epoch": 0.03675203858964052, + "grad_norm": 0.05719422921538353, + "learning_rate": 4.9912321481237616e-06, + "logits/chosen": 14.28278923034668, + "logits/rejected": 14.76964282989502, + "logps/chosen": -0.27940627932548523, + "logps/rejected": -0.3408831059932709, + "loss": 0.9215, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.41910940408706665, + "rewards/margins": 0.09221524000167847, + "rewards/rejected": -0.5113246440887451, + "step": 40 + }, + { + "epoch": 0.04594004823705065, + "grad_norm": 0.06247895210981369, + "learning_rate": 4.986304738420684e-06, + "logits/chosen": 14.943578720092773, + "logits/rejected": 14.936178207397461, + "logps/chosen": -0.2819541394710541, + "logps/rejected": -0.3245392441749573, + "loss": 0.9464, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.4229312539100647, + "rewards/margins": 0.06387762725353241, + "rewards/rejected": -0.4868088662624359, + "step": 50 + }, + { + "epoch": 0.04594004823705065, + "eval_logits/chosen": 14.7594575881958, + "eval_logits/rejected": 15.193694114685059, + "eval_logps/chosen": -0.2807807922363281, + "eval_logps/rejected": -0.36209535598754883, + "eval_loss": 0.9397181868553162, + "eval_rewards/accuracies": 0.5681818127632141, + "eval_rewards/chosen": -0.4211711883544922, + "eval_rewards/margins": 0.12197184562683105, + "eval_rewards/rejected": -0.5431429743766785, + "eval_runtime": 24.9762, + "eval_samples_per_second": 28.187, + "eval_steps_per_second": 3.523, + "step": 50 + }, + { + "epoch": 0.05512805788446078, + "grad_norm": 0.11519577354192734, + "learning_rate": 4.980286753286196e-06, + "logits/chosen": 14.996228218078613, + "logits/rejected": 15.37781810760498, + "logps/chosen": -0.2809831202030182, + "logps/rejected": -0.35486167669296265, + "loss": 0.9318, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4214746952056885, + "rewards/margins": 0.1108178049325943, + "rewards/rejected": -0.5322924852371216, + "step": 60 + }, + { + "epoch": 0.06431606753187091, + "grad_norm": 0.06691388040781021, + "learning_rate": 4.973180832407471e-06, + "logits/chosen": 14.612454414367676, + "logits/rejected": 15.678136825561523, + "logps/chosen": -0.2569667100906372, + "logps/rejected": -0.40047627687454224, + "loss": 0.9158, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3854501247406006, + "rewards/margins": 0.21526429057121277, + "rewards/rejected": -0.600714385509491, + "step": 70 + }, + { + "epoch": 0.07350407717928104, + "grad_norm": 0.05976058170199394, + "learning_rate": 4.964990092676263e-06, + "logits/chosen": 14.873895645141602, + "logits/rejected": 15.50474739074707, + "logps/chosen": -0.28742527961730957, + "logps/rejected": -0.37555089592933655, + "loss": 0.9372, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.43113788962364197, + "rewards/margins": 0.13218846917152405, + "rewards/rejected": -0.5633264183998108, + "step": 80 + }, + { + "epoch": 0.08269208682669117, + "grad_norm": 0.0602131113409996, + "learning_rate": 4.9557181268217225e-06, + "logits/chosen": 14.356691360473633, + "logits/rejected": 14.895658493041992, + "logps/chosen": -0.2613506317138672, + "logps/rejected": -0.3317110538482666, + "loss": 0.9324, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3920259475708008, + "rewards/margins": 0.10554064810276031, + "rewards/rejected": -0.4975665509700775, + "step": 90 + }, + { + "epoch": 0.0918800964741013, + "grad_norm": 0.07126503437757492, + "learning_rate": 4.9453690018345144e-06, + "logits/chosen": 14.862826347351074, + "logits/rejected": 15.257089614868164, + "logps/chosen": -0.2707213759422302, + "logps/rejected": -0.3511395752429962, + "loss": 0.9353, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4060820937156677, + "rewards/margins": 0.1206272691488266, + "rewards/rejected": -0.5267094373703003, + "step": 100 + }, + { + "epoch": 0.0918800964741013, + "eval_logits/chosen": 14.664334297180176, + "eval_logits/rejected": 15.113536834716797, + "eval_logps/chosen": -0.2750833034515381, + "eval_logps/rejected": -0.36540210247039795, + "eval_loss": 0.9324077367782593, + "eval_rewards/accuracies": 0.5795454382896423, + "eval_rewards/chosen": -0.41262495517730713, + "eval_rewards/margins": 0.1354781985282898, + "eval_rewards/rejected": -0.5481031537055969, + "eval_runtime": 24.4286, + "eval_samples_per_second": 28.819, + "eval_steps_per_second": 3.602, + "step": 100 + }, + { + "epoch": 0.10106810612151143, + "grad_norm": 0.07136944681406021, + "learning_rate": 4.933947257182901e-06, + "logits/chosen": 14.942098617553711, + "logits/rejected": 15.138586044311523, + "logps/chosen": -0.2860812246799469, + "logps/rejected": -0.36259371042251587, + "loss": 0.934, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.42912182211875916, + "rewards/margins": 0.11476878076791763, + "rewards/rejected": -0.5438905954360962, + "step": 110 + }, + { + "epoch": 0.11025611576892155, + "grad_norm": 0.07038908451795578, + "learning_rate": 4.921457902821578e-06, + "logits/chosen": 14.488851547241211, + "logits/rejected": 14.702054023742676, + "logps/chosen": -0.2662215232849121, + "logps/rejected": -0.3013685941696167, + "loss": 0.9202, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.39933228492736816, + "rewards/margins": 0.05272058770060539, + "rewards/rejected": -0.45205289125442505, + "step": 120 + }, + { + "epoch": 0.11944412541633169, + "grad_norm": 0.06875801086425781, + "learning_rate": 4.907906416994146e-06, + "logits/chosen": 14.075657844543457, + "logits/rejected": 14.696513175964355, + "logps/chosen": -0.250360369682312, + "logps/rejected": -0.3504650592803955, + "loss": 0.9266, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.375540554523468, + "rewards/margins": 0.15015706419944763, + "rewards/rejected": -0.5256975889205933, + "step": 130 + }, + { + "epoch": 0.12863213506374183, + "grad_norm": 0.0984601378440857, + "learning_rate": 4.893298743830168e-06, + "logits/chosen": 13.738212585449219, + "logits/rejected": 14.311574935913086, + "logps/chosen": -0.26711025834083557, + "logps/rejected": -0.3587702810764313, + "loss": 0.9185, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.40066537261009216, + "rewards/margins": 0.13749003410339355, + "rewards/rejected": -0.5381554365158081, + "step": 140 + }, + { + "epoch": 0.13782014471115195, + "grad_norm": 0.10201425850391388, + "learning_rate": 4.8776412907378845e-06, + "logits/chosen": 13.7462797164917, + "logits/rejected": 14.230626106262207, + "logps/chosen": -0.25559619069099426, + "logps/rejected": -0.3708702623844147, + "loss": 0.9106, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.3833943009376526, + "rewards/margins": 0.17291104793548584, + "rewards/rejected": -0.5563054084777832, + "step": 150 + }, + { + "epoch": 0.13782014471115195, + "eval_logits/chosen": 13.458538055419922, + "eval_logits/rejected": 13.998083114624023, + "eval_logps/chosen": -0.2759075462818146, + "eval_logps/rejected": -0.3873325288295746, + "eval_loss": 0.9164085388183594, + "eval_rewards/accuracies": 0.5795454382896423, + "eval_rewards/chosen": -0.41386130452156067, + "eval_rewards/margins": 0.1671374887228012, + "eval_rewards/rejected": -0.5809988379478455, + "eval_runtime": 24.4393, + "eval_samples_per_second": 28.806, + "eval_steps_per_second": 3.601, + "step": 150 + }, + { + "epoch": 0.14700815435856207, + "grad_norm": 0.11537656933069229, + "learning_rate": 4.860940925593703e-06, + "logits/chosen": 12.686149597167969, + "logits/rejected": 13.478736877441406, + "logps/chosen": -0.23941929638385773, + "logps/rejected": -0.3713286519050598, + "loss": 0.9094, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3591288924217224, + "rewards/margins": 0.1978640854358673, + "rewards/rejected": -0.5569929480552673, + "step": 160 + }, + { + "epoch": 0.1561961640059722, + "grad_norm": 0.1196313351392746, + "learning_rate": 4.84320497372973e-06, + "logits/chosen": 13.221656799316406, + "logits/rejected": 13.317082405090332, + "logps/chosen": -0.3033878207206726, + "logps/rejected": -0.3784424960613251, + "loss": 0.9057, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.4550817608833313, + "rewards/margins": 0.11258199065923691, + "rewards/rejected": -0.5676637887954712, + "step": 170 + }, + { + "epoch": 0.16538417365338234, + "grad_norm": 0.18745549023151398, + "learning_rate": 4.824441214720629e-06, + "logits/chosen": 11.797627449035645, + "logits/rejected": 12.031414985656738, + "logps/chosen": -0.2746419608592987, + "logps/rejected": -0.3629845976829529, + "loss": 0.8954, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.41196292638778687, + "rewards/margins": 0.13251398503780365, + "rewards/rejected": -0.5444768667221069, + "step": 180 + }, + { + "epoch": 0.17457218330079247, + "grad_norm": 0.1806156188249588, + "learning_rate": 4.804657878971252e-06, + "logits/chosen": 10.275301933288574, + "logits/rejected": 10.937273025512695, + "logps/chosen": -0.2880379557609558, + "logps/rejected": -0.4154580533504486, + "loss": 0.8875, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.43205690383911133, + "rewards/margins": 0.19113019108772278, + "rewards/rejected": -0.6231871247291565, + "step": 190 + }, + { + "epoch": 0.1837601929482026, + "grad_norm": 0.1839464157819748, + "learning_rate": 4.783863644106502e-06, + "logits/chosen": 10.020039558410645, + "logits/rejected": 10.66059398651123, + "logps/chosen": -0.3136019706726074, + "logps/rejected": -0.4385503828525543, + "loss": 0.8647, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.47040295600891113, + "rewards/margins": 0.18742261826992035, + "rewards/rejected": -0.6578255891799927, + "step": 200 + }, + { + "epoch": 0.1837601929482026, + "eval_logits/chosen": 9.442557334899902, + "eval_logits/rejected": 10.053345680236816, + "eval_logps/chosen": -0.3080674409866333, + "eval_logps/rejected": -0.4899139702320099, + "eval_loss": 0.8702690005302429, + "eval_rewards/accuracies": 0.6931818127632141, + "eval_rewards/chosen": -0.46210116147994995, + "eval_rewards/margins": 0.27276986837387085, + "eval_rewards/rejected": -0.7348710894584656, + "eval_runtime": 24.4185, + "eval_samples_per_second": 28.831, + "eval_steps_per_second": 3.604, + "step": 200 + }, + { + "epoch": 0.19294820259561274, + "grad_norm": 0.269613116979599, + "learning_rate": 4.762067631165049e-06, + "logits/chosen": 7.941342353820801, + "logits/rejected": 8.542920112609863, + "logps/chosen": -0.3083941638469696, + "logps/rejected": -0.5024437308311462, + "loss": 0.8471, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.4625912606716156, + "rewards/margins": 0.29107433557510376, + "rewards/rejected": -0.7536656856536865, + "step": 210 + }, + { + "epoch": 0.20213621224302286, + "grad_norm": 0.2640094459056854, + "learning_rate": 4.7392794005985324e-06, + "logits/chosen": 7.587499141693115, + "logits/rejected": 7.592519283294678, + "logps/chosen": -0.3381899893283844, + "logps/rejected": -0.48494213819503784, + "loss": 0.8427, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.5072849988937378, + "rewards/margins": 0.22012826800346375, + "rewards/rejected": -0.7274132966995239, + "step": 220 + }, + { + "epoch": 0.21132422189043298, + "grad_norm": 0.29708293080329895, + "learning_rate": 4.715508948078037e-06, + "logits/chosen": 6.250656604766846, + "logits/rejected": 6.7652716636657715, + "logps/chosen": -0.3644888997077942, + "logps/rejected": -0.5470594167709351, + "loss": 0.8201, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5467333793640137, + "rewards/margins": 0.2738557755947113, + "rewards/rejected": -0.8205891847610474, + "step": 230 + }, + { + "epoch": 0.2205122315378431, + "grad_norm": 0.35299497842788696, + "learning_rate": 4.690766700109659e-06, + "logits/chosen": 4.6331706047058105, + "logits/rejected": 4.710076332092285, + "logps/chosen": -0.3634452223777771, + "logps/rejected": -0.7193974256515503, + "loss": 0.7877, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.545167863368988, + "rewards/margins": 0.5339283347129822, + "rewards/rejected": -1.0790963172912598, + "step": 240 + }, + { + "epoch": 0.22970024118525326, + "grad_norm": 0.4265730082988739, + "learning_rate": 4.665063509461098e-06, + "logits/chosen": 4.992984771728516, + "logits/rejected": 4.606354713439941, + "logps/chosen": -0.413116455078125, + "logps/rejected": -0.7104976177215576, + "loss": 0.7902, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6196746826171875, + "rewards/margins": 0.4460717737674713, + "rewards/rejected": -1.0657463073730469, + "step": 250 + }, + { + "epoch": 0.22970024118525326, + "eval_logits/chosen": 4.127804279327393, + "eval_logits/rejected": 3.742251396179199, + "eval_logps/chosen": -0.420327365398407, + "eval_logps/rejected": -0.7902651429176331, + "eval_loss": 0.7682384252548218, + "eval_rewards/accuracies": 0.7159090638160706, + "eval_rewards/chosen": -0.6304910182952881, + "eval_rewards/margins": 0.5549066662788391, + "eval_rewards/rejected": -1.185397744178772, + "eval_runtime": 24.4318, + "eval_samples_per_second": 28.815, + "eval_steps_per_second": 3.602, + "step": 250 + }, + { + "epoch": 0.23888825083266338, + "grad_norm": 0.7236106395721436, + "learning_rate": 4.638410650401267e-06, + "logits/chosen": 2.454423427581787, + "logits/rejected": 1.816563367843628, + "logps/chosen": -0.4492695927619934, + "logps/rejected": -0.8738088607788086, + "loss": 0.6911, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6739044189453125, + "rewards/margins": 0.6368088126182556, + "rewards/rejected": -1.3107131719589233, + "step": 260 + }, + { + "epoch": 0.2480762604800735, + "grad_norm": 0.5856125950813293, + "learning_rate": 4.610819813755038e-06, + "logits/chosen": 3.2105612754821777, + "logits/rejected": 2.5531132221221924, + "logps/chosen": -0.537078320980072, + "logps/rejected": -1.2025481462478638, + "loss": 0.6774, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.80561763048172, + "rewards/margins": 0.9982045888900757, + "rewards/rejected": -1.8038222789764404, + "step": 270 + }, + { + "epoch": 0.25726427012748365, + "grad_norm": 0.7396731972694397, + "learning_rate": 4.582303101775249e-06, + "logits/chosen": 2.0327231884002686, + "logits/rejected": 1.4601097106933594, + "logps/chosen": -0.47658151388168335, + "logps/rejected": -1.3808696269989014, + "loss": 0.628, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7148722410202026, + "rewards/margins": 1.3564319610595703, + "rewards/rejected": -2.0713045597076416, + "step": 280 + }, + { + "epoch": 0.2664522797748938, + "grad_norm": 2.412203550338745, + "learning_rate": 4.55287302283426e-06, + "logits/chosen": 1.2262591123580933, + "logits/rejected": 0.22599482536315918, + "logps/chosen": -0.5671601891517639, + "logps/rejected": -1.6760343313217163, + "loss": 0.5988, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8507402539253235, + "rewards/margins": 1.6633113622665405, + "rewards/rejected": -2.5140514373779297, + "step": 290 + }, + { + "epoch": 0.2756402894223039, + "grad_norm": 1.0477895736694336, + "learning_rate": 4.522542485937369e-06, + "logits/chosen": 1.9952911138534546, + "logits/rejected": 1.1298446655273438, + "logps/chosen": -0.6152974367141724, + "logps/rejected": -2.128481388092041, + "loss": 0.5927, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9229460954666138, + "rewards/margins": 2.2697763442993164, + "rewards/rejected": -3.192722797393799, + "step": 300 + }, + { + "epoch": 0.2756402894223039, + "eval_logits/chosen": 1.6342910528182983, + "eval_logits/rejected": 0.633538007736206, + "eval_logps/chosen": -0.606099545955658, + "eval_logps/rejected": -1.882785439491272, + "eval_loss": 0.5978505611419678, + "eval_rewards/accuracies": 0.7159090638160706, + "eval_rewards/chosen": -0.909149169921875, + "eval_rewards/margins": 1.9150291681289673, + "eval_rewards/rejected": -2.824178457260132, + "eval_runtime": 24.4299, + "eval_samples_per_second": 28.817, + "eval_steps_per_second": 3.602, + "step": 300 + }, + { + "epoch": 0.284828299069714, + "grad_norm": 3.0767388343811035, + "learning_rate": 4.491324795060491e-06, + "logits/chosen": 1.0374902486801147, + "logits/rejected": 0.6220051646232605, + "logps/chosen": -0.6778531074523926, + "logps/rejected": -1.8290255069732666, + "loss": 0.5792, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0167796611785889, + "rewards/margins": 1.726758599281311, + "rewards/rejected": -2.7435383796691895, + "step": 310 + }, + { + "epoch": 0.29401630871712414, + "grad_norm": 0.6015120148658752, + "learning_rate": 4.4592336433146e-06, + "logits/chosen": 1.0050956010818481, + "logits/rejected": -0.016118621453642845, + "logps/chosen": -0.6865260004997253, + "logps/rejected": -2.113417148590088, + "loss": 0.5384, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.0297890901565552, + "rewards/margins": 2.140336513519287, + "rewards/rejected": -3.1701254844665527, + "step": 320 + }, + { + "epoch": 0.30320431836453426, + "grad_norm": 0.7415631413459778, + "learning_rate": 4.426283106939474e-06, + "logits/chosen": 0.5536144971847534, + "logits/rejected": -0.1644023358821869, + "logps/chosen": -0.8181726336479187, + "logps/rejected": -2.581185817718506, + "loss": 0.5671, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2272589206695557, + "rewards/margins": 2.644519805908203, + "rewards/rejected": -3.871778964996338, + "step": 330 + }, + { + "epoch": 0.3123923280119444, + "grad_norm": 0.8956871628761292, + "learning_rate": 4.3924876391293915e-06, + "logits/chosen": 1.6062015295028687, + "logits/rejected": 0.9243733286857605, + "logps/chosen": -0.8991573452949524, + "logps/rejected": -2.935060977935791, + "loss": 0.5124, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.348736047744751, + "rewards/margins": 3.0538551807403564, + "rewards/rejected": -4.402591228485107, + "step": 340 + }, + { + "epoch": 0.32158033765935456, + "grad_norm": 1.1822264194488525, + "learning_rate": 4.357862063693486e-06, + "logits/chosen": 1.8171085119247437, + "logits/rejected": 1.228049397468567, + "logps/chosen": -0.8822734951972961, + "logps/rejected": -2.4744174480438232, + "loss": 0.516, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3234103918075562, + "rewards/margins": 2.388216018676758, + "rewards/rejected": -3.7116265296936035, + "step": 350 + }, + { + "epoch": 0.32158033765935456, + "eval_logits/chosen": 1.1368330717086792, + "eval_logits/rejected": 0.2795785665512085, + "eval_logps/chosen": -0.9485350251197815, + "eval_logps/rejected": -2.6484899520874023, + "eval_loss": 0.5133901238441467, + "eval_rewards/accuracies": 0.7727272510528564, + "eval_rewards/chosen": -1.4228025674819946, + "eval_rewards/margins": 2.5499324798583984, + "eval_rewards/rejected": -3.9727354049682617, + "eval_runtime": 24.4277, + "eval_samples_per_second": 28.82, + "eval_steps_per_second": 3.602, + "step": 350 + }, + { + "epoch": 0.3307683473067647, + "grad_norm": 2.5278775691986084, + "learning_rate": 4.322421568553529e-06, + "logits/chosen": 1.1921958923339844, + "logits/rejected": 0.7565670013427734, + "logps/chosen": -1.4180412292480469, + "logps/rejected": -3.0890870094299316, + "loss": 0.4811, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1270618438720703, + "rewards/margins": 2.5065689086914062, + "rewards/rejected": -4.633630752563477, + "step": 360 + }, + { + "epoch": 0.3399563569541748, + "grad_norm": 1.7325788736343384, + "learning_rate": 4.286181699082008e-06, + "logits/chosen": 0.997096836566925, + "logits/rejected": 0.4399908483028412, + "logps/chosen": -1.9010308980941772, + "logps/rejected": -3.6025185585021973, + "loss": 0.4326, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.851546287536621, + "rewards/margins": 2.552231550216675, + "rewards/rejected": -5.403778076171875, + "step": 370 + }, + { + "epoch": 0.34914436660158493, + "grad_norm": 4.608370304107666, + "learning_rate": 4.249158351283414e-06, + "logits/chosen": 1.0715999603271484, + "logits/rejected": 0.6113725900650024, + "logps/chosen": -2.4032845497131348, + "logps/rejected": -3.9940528869628906, + "loss": 0.4027, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.6049270629882812, + "rewards/margins": 2.3861520290374756, + "rewards/rejected": -5.991078853607178, + "step": 380 + }, + { + "epoch": 0.35833237624899505, + "grad_norm": 4.600816249847412, + "learning_rate": 4.211367764821722e-06, + "logits/chosen": 1.4443682432174683, + "logits/rejected": 0.8617011904716492, + "logps/chosen": -2.349093198776245, + "logps/rejected": -3.9943203926086426, + "loss": 0.4118, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.5236401557922363, + "rewards/margins": 2.467839479446411, + "rewards/rejected": -5.991480350494385, + "step": 390 + }, + { + "epoch": 0.3675203858964052, + "grad_norm": 2.1458094120025635, + "learning_rate": 4.172826515897146e-06, + "logits/chosen": 1.3029582500457764, + "logits/rejected": 0.7705980539321899, + "logps/chosen": -2.192617416381836, + "logps/rejected": -3.8413078784942627, + "loss": 0.3557, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.288925886154175, + "rewards/margins": 2.4730350971221924, + "rewards/rejected": -5.761960506439209, + "step": 400 + }, + { + "epoch": 0.3675203858964052, + "eval_logits/chosen": 0.8742353320121765, + "eval_logits/rejected": 0.14320053160190582, + "eval_logps/chosen": -2.0894505977630615, + "eval_logps/rejected": -4.20783805847168, + "eval_loss": 0.4082850515842438, + "eval_rewards/accuracies": 0.8863636255264282, + "eval_rewards/chosen": -3.1341757774353027, + "eval_rewards/margins": 3.1775810718536377, + "eval_rewards/rejected": -6.311756610870361, + "eval_runtime": 24.4316, + "eval_samples_per_second": 28.815, + "eval_steps_per_second": 3.602, + "step": 400 + }, + { + "epoch": 0.3767083955438153, + "grad_norm": 2.5496387481689453, + "learning_rate": 4.133551509975264e-06, + "logits/chosen": 1.9024279117584229, + "logits/rejected": 1.4777928590774536, + "logps/chosen": -2.7541470527648926, + "logps/rejected": -4.542642116546631, + "loss": 0.4184, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.13122034072876, + "rewards/margins": 2.6827428340911865, + "rewards/rejected": -6.813962459564209, + "step": 410 + }, + { + "epoch": 0.3858964051912255, + "grad_norm": 2.6187174320220947, + "learning_rate": 4.093559974371725e-06, + "logits/chosen": 1.9345887899398804, + "logits/rejected": 1.4047685861587524, + "logps/chosen": -2.389430522918701, + "logps/rejected": -4.439882755279541, + "loss": 0.4096, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.5841457843780518, + "rewards/margins": 3.0756776332855225, + "rewards/rejected": -6.659823417663574, + "step": 420 + }, + { + "epoch": 0.3950844148386356, + "grad_norm": 3.762899398803711, + "learning_rate": 4.052869450695776e-06, + "logits/chosen": 1.2521915435791016, + "logits/rejected": 0.7238092422485352, + "logps/chosen": -2.5571534633636475, + "logps/rejected": -4.675185203552246, + "loss": 0.4083, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.835730791091919, + "rewards/margins": 3.1770474910736084, + "rewards/rejected": -7.012777805328369, + "step": 430 + }, + { + "epoch": 0.4042724244860457, + "grad_norm": 3.2343404293060303, + "learning_rate": 4.011497787155938e-06, + "logits/chosen": 1.9626567363739014, + "logits/rejected": 1.3136894702911377, + "logps/chosen": -2.892967939376831, + "logps/rejected": -5.003688335418701, + "loss": 0.3783, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.339452266693115, + "rewards/margins": 3.1660804748535156, + "rewards/rejected": -7.505532264709473, + "step": 440 + }, + { + "epoch": 0.41346043413345585, + "grad_norm": 3.2979824542999268, + "learning_rate": 3.969463130731183e-06, + "logits/chosen": 1.159234881401062, + "logits/rejected": 0.5396692752838135, + "logps/chosen": -2.7545204162597656, + "logps/rejected": -5.4179182052612305, + "loss": 0.347, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.131781101226807, + "rewards/margins": 3.9950966835021973, + "rewards/rejected": -8.126876831054688, + "step": 450 + }, + { + "epoch": 0.41346043413345585, + "eval_logits/chosen": 0.9304068088531494, + "eval_logits/rejected": 0.30015668272972107, + "eval_logps/chosen": -2.485308885574341, + "eval_logps/rejected": -4.880238056182861, + "eval_loss": 0.36501914262771606, + "eval_rewards/accuracies": 0.9204545617103577, + "eval_rewards/chosen": -3.7279627323150635, + "eval_rewards/margins": 3.592393636703491, + "eval_rewards/rejected": -7.320356845855713, + "eval_runtime": 24.4234, + "eval_samples_per_second": 28.825, + "eval_steps_per_second": 3.603, + "step": 450 + }, + { + "epoch": 0.42264844378086597, + "grad_norm": 3.155860185623169, + "learning_rate": 3.92678391921108e-06, + "logits/chosen": 1.633522391319275, + "logits/rejected": 1.0575059652328491, + "logps/chosen": -2.7214303016662598, + "logps/rejected": -4.766176223754883, + "loss": 0.3689, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.082144737243652, + "rewards/margins": 3.0671191215515137, + "rewards/rejected": -7.149264335632324, + "step": 460 + }, + { + "epoch": 0.4318364534282761, + "grad_norm": 2.9949357509613037, + "learning_rate": 3.88347887310836e-06, + "logits/chosen": 1.3139212131500244, + "logits/rejected": 0.5838541388511658, + "logps/chosen": -2.5324313640594482, + "logps/rejected": -4.993292808532715, + "loss": 0.3587, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.7986464500427246, + "rewards/margins": 3.6912925243377686, + "rewards/rejected": -7.489938259124756, + "step": 470 + }, + { + "epoch": 0.4410244630756862, + "grad_norm": 2.4548208713531494, + "learning_rate": 3.839566987447492e-06, + "logits/chosen": 1.2342166900634766, + "logits/rejected": 0.8118699193000793, + "logps/chosen": -2.876863479614258, + "logps/rejected": -5.457588195800781, + "loss": 0.2922, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.315295219421387, + "rewards/margins": 3.871086597442627, + "rewards/rejected": -8.186381340026855, + "step": 480 + }, + { + "epoch": 0.45021247272309634, + "grad_norm": 2.0953762531280518, + "learning_rate": 3.795067523432826e-06, + "logits/chosen": 1.8121936321258545, + "logits/rejected": 1.454637050628662, + "logps/chosen": -3.2022106647491455, + "logps/rejected": -5.47930908203125, + "loss": 0.3086, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.803316116333008, + "rewards/margins": 3.4156479835510254, + "rewards/rejected": -8.218963623046875, + "step": 490 + }, + { + "epoch": 0.4594004823705065, + "grad_norm": 2.4918301105499268, + "learning_rate": 3.7500000000000005e-06, + "logits/chosen": 1.5386875867843628, + "logits/rejected": 1.2544763088226318, + "logps/chosen": -3.2174277305603027, + "logps/rejected": -6.012864112854004, + "loss": 0.3128, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.826140880584717, + "rewards/margins": 4.193154335021973, + "rewards/rejected": -9.019296646118164, + "step": 500 + }, + { + "epoch": 0.4594004823705065, + "eval_logits/chosen": 0.8257808685302734, + "eval_logits/rejected": 0.27475622296333313, + "eval_logps/chosen": -2.755974054336548, + "eval_logps/rejected": -5.469714641571045, + "eval_loss": 0.32884541153907776, + "eval_rewards/accuracies": 0.9204545617103577, + "eval_rewards/chosen": -4.133961200714111, + "eval_rewards/margins": 4.070610523223877, + "eval_rewards/rejected": -8.204572677612305, + "eval_runtime": 24.4191, + "eval_samples_per_second": 28.83, + "eval_steps_per_second": 3.604, + "step": 500 + }, + { + "epoch": 0.46858849201791664, + "grad_norm": 4.017474174499512, + "learning_rate": 3.7043841852542884e-06, + "logits/chosen": 1.8042447566986084, + "logits/rejected": 1.4390740394592285, + "logps/chosen": -3.4311797618865967, + "logps/rejected": -6.114380359649658, + "loss": 0.335, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.1467695236206055, + "rewards/margins": 4.0248003005981445, + "rewards/rejected": -9.171568870544434, + "step": 510 + }, + { + "epoch": 0.47777650166532676, + "grad_norm": 5.516397953033447, + "learning_rate": 3.658240087799655e-06, + "logits/chosen": 1.1141811609268188, + "logits/rejected": 0.7766789197921753, + "logps/chosen": -3.1831612586975098, + "logps/rejected": -5.615653991699219, + "loss": 0.3306, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.774742603302002, + "rewards/margins": 3.6487393379211426, + "rewards/rejected": -8.423480987548828, + "step": 520 + }, + { + "epoch": 0.4869645113127369, + "grad_norm": 4.1005635261535645, + "learning_rate": 3.611587947962319e-06, + "logits/chosen": 1.5796890258789062, + "logits/rejected": 1.1561863422393799, + "logps/chosen": -3.286179304122925, + "logps/rejected": -5.805339813232422, + "loss": 0.3151, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.929268836975098, + "rewards/margins": 3.778740644454956, + "rewards/rejected": -8.708009719848633, + "step": 530 + }, + { + "epoch": 0.496152520960147, + "grad_norm": 2.8616511821746826, + "learning_rate": 3.564448228912682e-06, + "logits/chosen": 0.6956934928894043, + "logits/rejected": 0.1268310248851776, + "logps/chosen": -2.8933937549591064, + "logps/rejected": -5.794272422790527, + "loss": 0.2995, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.340090274810791, + "rewards/margins": 4.351318836212158, + "rewards/rejected": -8.691408157348633, + "step": 540 + }, + { + "epoch": 0.5053405306075571, + "grad_norm": 2.237276315689087, + "learning_rate": 3.516841607689501e-06, + "logits/chosen": 1.7460235357284546, + "logits/rejected": 0.9990445375442505, + "logps/chosen": -2.860546827316284, + "logps/rejected": -5.517810821533203, + "loss": 0.3203, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.290820121765137, + "rewards/margins": 3.985896348953247, + "rewards/rejected": -8.276716232299805, + "step": 550 + }, + { + "epoch": 0.5053405306075571, + "eval_logits/chosen": 0.8703196048736572, + "eval_logits/rejected": 0.3152187466621399, + "eval_logps/chosen": -2.709866523742676, + "eval_logps/rejected": -5.687611103057861, + "eval_loss": 0.304200142621994, + "eval_rewards/accuracies": 0.9204545617103577, + "eval_rewards/chosen": -4.064799785614014, + "eval_rewards/margins": 4.466617107391357, + "eval_rewards/rejected": -8.531416893005371, + "eval_runtime": 24.6083, + "eval_samples_per_second": 28.608, + "eval_steps_per_second": 3.576, + "step": 550 + }, + { + "epoch": 0.5145285402549673, + "grad_norm": 3.7983174324035645, + "learning_rate": 3.4687889661302577e-06, + "logits/chosen": 1.084142804145813, + "logits/rejected": 0.9131366610527039, + "logps/chosen": -3.4958484172821045, + "logps/rejected": -7.143038749694824, + "loss": 0.2728, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.243772506713867, + "rewards/margins": 5.470786094665527, + "rewards/rejected": -10.714558601379395, + "step": 560 + }, + { + "epoch": 0.5237165499023774, + "grad_norm": 2.398188829421997, + "learning_rate": 3.4203113817116955e-06, + "logits/chosen": 1.2777886390686035, + "logits/rejected": 0.6827106475830078, + "logps/chosen": -3.4014134407043457, + "logps/rejected": -6.260494232177734, + "loss": 0.2833, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.102120399475098, + "rewards/margins": 4.288620948791504, + "rewards/rejected": -9.390741348266602, + "step": 570 + }, + { + "epoch": 0.5329045595497875, + "grad_norm": 4.97003173828125, + "learning_rate": 3.3714301183045382e-06, + "logits/chosen": 1.6978384256362915, + "logits/rejected": 1.1092720031738281, + "logps/chosen": -3.179011106491089, + "logps/rejected": -6.802459716796875, + "loss": 0.2942, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.768516540527344, + "rewards/margins": 5.435172080993652, + "rewards/rejected": -10.203688621520996, + "step": 580 + }, + { + "epoch": 0.5420925691971976, + "grad_norm": 4.482264995574951, + "learning_rate": 3.3221666168464584e-06, + "logits/chosen": 1.0245933532714844, + "logits/rejected": 0.7797524929046631, + "logps/chosen": -3.2393958568573, + "logps/rejected": -5.916412353515625, + "loss": 0.3347, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.859094142913818, + "rewards/margins": 4.015524864196777, + "rewards/rejected": -8.874618530273438, + "step": 590 + }, + { + "epoch": 0.5512805788446078, + "grad_norm": 3.913116931915283, + "learning_rate": 3.272542485937369e-06, + "logits/chosen": 1.3320618867874146, + "logits/rejected": 0.9762558937072754, + "logps/chosen": -3.2751450538635254, + "logps/rejected": -6.7808518409729, + "loss": 0.2776, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.912716865539551, + "rewards/margins": 5.258560657501221, + "rewards/rejected": -10.17127799987793, + "step": 600 + }, + { + "epoch": 0.5512805788446078, + "eval_logits/chosen": 0.9448758959770203, + "eval_logits/rejected": 0.42817041277885437, + "eval_logps/chosen": -2.835143804550171, + "eval_logps/rejected": -6.087582588195801, + "eval_loss": 0.28658536076545715, + "eval_rewards/accuracies": 0.9204545617103577, + "eval_rewards/chosen": -4.252715587615967, + "eval_rewards/margins": 4.878659248352051, + "eval_rewards/rejected": -9.13137435913086, + "eval_runtime": 24.4325, + "eval_samples_per_second": 28.814, + "eval_steps_per_second": 3.602, + "step": 600 + }, + { + "epoch": 0.560468588492018, + "grad_norm": 3.8335089683532715, + "learning_rate": 3.222579492361179e-06, + "logits/chosen": 1.295569658279419, + "logits/rejected": 1.0852326154708862, + "logps/chosen": -3.4870052337646484, + "logps/rejected": -6.572986602783203, + "loss": 0.3243, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.230508327484131, + "rewards/margins": 4.628971576690674, + "rewards/rejected": -9.859478950500488, + "step": 610 + }, + { + "epoch": 0.569656598139428, + "grad_norm": 3.095102071762085, + "learning_rate": 3.1722995515381644e-06, + "logits/chosen": 1.8339803218841553, + "logits/rejected": 1.467551589012146, + "logps/chosen": -3.6670470237731934, + "logps/rejected": -6.880410194396973, + "loss": 0.2615, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.500569820404053, + "rewards/margins": 4.820044040679932, + "rewards/rejected": -10.3206148147583, + "step": 620 + }, + { + "epoch": 0.5788446077868382, + "grad_norm": 4.381973743438721, + "learning_rate": 3.121724717912138e-06, + "logits/chosen": 2.390763521194458, + "logits/rejected": 2.155505418777466, + "logps/chosen": -3.3569788932800293, + "logps/rejected": -6.319291114807129, + "loss": 0.2649, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.035468101501465, + "rewards/margins": 4.4434685707092285, + "rewards/rejected": -9.478937149047852, + "step": 630 + }, + { + "epoch": 0.5880326174342483, + "grad_norm": 3.9204964637756348, + "learning_rate": 3.0708771752766397e-06, + "logits/chosen": 1.4343383312225342, + "logits/rejected": 1.1991338729858398, + "logps/chosen": -3.654259443283081, + "logps/rejected": -6.873109340667725, + "loss": 0.2761, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.481389045715332, + "rewards/margins": 4.828273773193359, + "rewards/rejected": -10.309663772583008, + "step": 640 + }, + { + "epoch": 0.5972206270816585, + "grad_norm": 3.179067373275757, + "learning_rate": 3.019779227044398e-06, + "logits/chosen": 2.0368576049804688, + "logits/rejected": 1.8037185668945312, + "logps/chosen": -3.791682720184326, + "logps/rejected": -6.912911415100098, + "loss": 0.2739, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.68752384185791, + "rewards/margins": 4.68184232711792, + "rewards/rejected": -10.369367599487305, + "step": 650 + }, + { + "epoch": 0.5972206270816585, + "eval_logits/chosen": 1.0889618396759033, + "eval_logits/rejected": 0.5887767672538757, + "eval_logps/chosen": -3.0191192626953125, + "eval_logps/rejected": -6.388964653015137, + "eval_loss": 0.27443525195121765, + "eval_rewards/accuracies": 0.9204545617103577, + "eval_rewards/chosen": -4.528679370880127, + "eval_rewards/margins": 5.054768085479736, + "eval_rewards/rejected": -9.583446502685547, + "eval_runtime": 24.4235, + "eval_samples_per_second": 28.825, + "eval_steps_per_second": 3.603, + "step": 650 + }, + { + "epoch": 0.6064086367290685, + "grad_norm": 2.804940700531006, + "learning_rate": 2.9684532864643123e-06, + "logits/chosen": 1.8347485065460205, + "logits/rejected": 1.3636573553085327, + "logps/chosen": -3.398799419403076, + "logps/rejected": -6.337627410888672, + "loss": 0.2678, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.098199367523193, + "rewards/margins": 4.408241271972656, + "rewards/rejected": -9.506441116333008, + "step": 660 + }, + { + "epoch": 0.6155966463764787, + "grad_norm": 3.7368969917297363, + "learning_rate": 2.9169218667902562e-06, + "logits/chosen": 1.6734691858291626, + "logits/rejected": 1.0945308208465576, + "logps/chosen": -3.354790210723877, + "logps/rejected": -6.3691864013671875, + "loss": 0.294, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.0321855545043945, + "rewards/margins": 4.521594524383545, + "rewards/rejected": -9.553780555725098, + "step": 670 + }, + { + "epoch": 0.6247846560238888, + "grad_norm": 4.483130931854248, + "learning_rate": 2.8652075714060296e-06, + "logits/chosen": 1.640228509902954, + "logits/rejected": 1.1519067287445068, + "logps/chosen": -3.4700570106506348, + "logps/rejected": -6.6276445388793945, + "loss": 0.23, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.205085277557373, + "rewards/margins": 4.736380577087402, + "rewards/rejected": -9.941465377807617, + "step": 680 + }, + { + "epoch": 0.633972665671299, + "grad_norm": 8.508203506469727, + "learning_rate": 2.813333083910761e-06, + "logits/chosen": 1.0501906871795654, + "logits/rejected": 0.5691097974777222, + "logps/chosen": -3.4982333183288574, + "logps/rejected": -7.144225120544434, + "loss": 0.2808, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.247350215911865, + "rewards/margins": 5.468987464904785, + "rewards/rejected": -10.716337203979492, + "step": 690 + }, + { + "epoch": 0.6431606753187091, + "grad_norm": 2.271857738494873, + "learning_rate": 2.761321158169134e-06, + "logits/chosen": 1.2119544744491577, + "logits/rejected": 0.7247776389122009, + "logps/chosen": -4.096956253051758, + "logps/rejected": -7.4536261558532715, + "loss": 0.2808, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.145434379577637, + "rewards/margins": 5.035003662109375, + "rewards/rejected": -11.180438995361328, + "step": 700 + }, + { + "epoch": 0.6431606753187091, + "eval_logits/chosen": 1.085008978843689, + "eval_logits/rejected": 0.5991834998130798, + "eval_logps/chosen": -3.179896354675293, + "eval_logps/rejected": -6.738409519195557, + "eval_loss": 0.26070258021354675, + "eval_rewards/accuracies": 0.9090909361839294, + "eval_rewards/chosen": -4.7698445320129395, + "eval_rewards/margins": 5.337769031524658, + "eval_rewards/rejected": -10.107613563537598, + "eval_runtime": 24.465, + "eval_samples_per_second": 28.776, + "eval_steps_per_second": 3.597, + "step": 700 + }, + { + "epoch": 0.6523486849661192, + "grad_norm": 4.058814525604248, + "learning_rate": 2.70919460833079e-06, + "logits/chosen": 1.2182409763336182, + "logits/rejected": 1.0481547117233276, + "logps/chosen": -3.532151699066162, + "logps/rejected": -7.4123854637146, + "loss": 0.2565, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.2982282638549805, + "rewards/margins": 5.8203511238098145, + "rewards/rejected": -11.118578910827637, + "step": 710 + }, + { + "epoch": 0.6615366946135294, + "grad_norm": 3.857574939727783, + "learning_rate": 2.6569762988232838e-06, + "logits/chosen": 0.8797380328178406, + "logits/rejected": 0.6432709693908691, + "logps/chosen": -3.3780128955841064, + "logps/rejected": -6.922214508056641, + "loss": 0.2321, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.067019462585449, + "rewards/margins": 5.3163018226623535, + "rewards/rejected": -10.383320808410645, + "step": 720 + }, + { + "epoch": 0.6707247042609394, + "grad_norm": 3.7125725746154785, + "learning_rate": 2.604689134322999e-06, + "logits/chosen": 1.447474479675293, + "logits/rejected": 1.1114810705184937, + "logps/chosen": -3.682945728302002, + "logps/rejected": -7.297152519226074, + "loss": 0.2812, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.524418830871582, + "rewards/margins": 5.421310901641846, + "rewards/rejected": -10.94572925567627, + "step": 730 + }, + { + "epoch": 0.6799127139083496, + "grad_norm": 2.026604652404785, + "learning_rate": 2.5523560497083927e-06, + "logits/chosen": 1.9674856662750244, + "logits/rejected": 1.593400239944458, + "logps/chosen": -3.3987834453582764, + "logps/rejected": -6.970278263092041, + "loss": 0.2603, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.098175048828125, + "rewards/margins": 5.357242107391357, + "rewards/rejected": -10.45541763305664, + "step": 740 + }, + { + "epoch": 0.6891007235557597, + "grad_norm": 2.357570171356201, + "learning_rate": 2.5e-06, + "logits/chosen": 1.9729163646697998, + "logits/rejected": 1.8791675567626953, + "logps/chosen": -3.3879425525665283, + "logps/rejected": -6.712514400482178, + "loss": 0.2348, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.081913948059082, + "rewards/margins": 4.986858367919922, + "rewards/rejected": -10.068772315979004, + "step": 750 + }, + { + "epoch": 0.6891007235557597, + "eval_logits/chosen": 1.1882920265197754, + "eval_logits/rejected": 0.7092404961585999, + "eval_logps/chosen": -2.855703592300415, + "eval_logps/rejected": -6.557665824890137, + "eval_loss": 0.25226154923439026, + "eval_rewards/accuracies": 0.9090909361839294, + "eval_rewards/chosen": -4.283555507659912, + "eval_rewards/margins": 5.552942276000977, + "eval_rewards/rejected": -9.836498260498047, + "eval_runtime": 24.4236, + "eval_samples_per_second": 28.825, + "eval_steps_per_second": 3.603, + "step": 750 + }, + { + "epoch": 0.6982887332031699, + "grad_norm": 3.1369588375091553, + "learning_rate": 2.447643950291608e-06, + "logits/chosen": 1.908418893814087, + "logits/rejected": 1.637202262878418, + "logps/chosen": -3.941102981567383, + "logps/rejected": -7.562623023986816, + "loss": 0.2412, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.911653995513916, + "rewards/margins": 5.432280540466309, + "rewards/rejected": -11.343935012817383, + "step": 760 + }, + { + "epoch": 0.70747674285058, + "grad_norm": 4.2312703132629395, + "learning_rate": 2.3953108656770018e-06, + "logits/chosen": 1.3125172853469849, + "logits/rejected": 0.933386504650116, + "logps/chosen": -3.5281143188476562, + "logps/rejected": -7.116488456726074, + "loss": 0.2528, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.292171001434326, + "rewards/margins": 5.382561683654785, + "rewards/rejected": -10.674734115600586, + "step": 770 + }, + { + "epoch": 0.7166647524979901, + "grad_norm": 5.6951799392700195, + "learning_rate": 2.3430237011767166e-06, + "logits/chosen": 2.295801877975464, + "logits/rejected": 1.8810745477676392, + "logps/chosen": -3.3887531757354736, + "logps/rejected": -6.8857245445251465, + "loss": 0.2672, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.083128929138184, + "rewards/margins": 5.245457649230957, + "rewards/rejected": -10.32858657836914, + "step": 780 + }, + { + "epoch": 0.7258527621454003, + "grad_norm": 3.734528064727783, + "learning_rate": 2.290805391669212e-06, + "logits/chosen": 1.6154979467391968, + "logits/rejected": 1.1886816024780273, + "logps/chosen": -3.379617214202881, + "logps/rejected": -7.1727166175842285, + "loss": 0.2315, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.069426536560059, + "rewards/margins": 5.689648628234863, + "rewards/rejected": -10.759074211120605, + "step": 790 + }, + { + "epoch": 0.7350407717928104, + "grad_norm": 3.2450063228607178, + "learning_rate": 2.238678841830867e-06, + "logits/chosen": 1.9270827770233154, + "logits/rejected": 1.5745903253555298, + "logps/chosen": -3.9242210388183594, + "logps/rejected": -7.254105567932129, + "loss": 0.2301, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.886331081390381, + "rewards/margins": 4.9948272705078125, + "rewards/rejected": -10.881157875061035, + "step": 800 + }, + { + "epoch": 0.7350407717928104, + "eval_logits/chosen": 1.228877067565918, + "eval_logits/rejected": 0.7780414819717407, + "eval_logps/chosen": -3.096149444580078, + "eval_logps/rejected": -6.9255452156066895, + "eval_loss": 0.23762211203575134, + "eval_rewards/accuracies": 0.9090909361839294, + "eval_rewards/chosen": -4.644224166870117, + "eval_rewards/margins": 5.744093894958496, + "eval_rewards/rejected": -10.388318061828613, + "eval_runtime": 24.4202, + "eval_samples_per_second": 28.829, + "eval_steps_per_second": 3.604, + "step": 800 + }, + { + "epoch": 0.7442287814402205, + "grad_norm": 3.6619718074798584, + "learning_rate": 2.186666916089239e-06, + "logits/chosen": 2.1697134971618652, + "logits/rejected": 1.8535985946655273, + "logps/chosen": -3.306549549102783, + "logps/rejected": -6.570471286773682, + "loss": 0.2239, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.959824562072754, + "rewards/margins": 4.895882606506348, + "rewards/rejected": -9.855707168579102, + "step": 810 + }, + { + "epoch": 0.7534167910876306, + "grad_norm": 3.9014956951141357, + "learning_rate": 2.134792428593971e-06, + "logits/chosen": 1.8228479623794556, + "logits/rejected": 1.5440882444381714, + "logps/chosen": -3.545902967453003, + "logps/rejected": -7.393707275390625, + "loss": 0.2429, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.318853855133057, + "rewards/margins": 5.7717061042785645, + "rewards/rejected": -11.090560913085938, + "step": 820 + }, + { + "epoch": 0.7626048007350408, + "grad_norm": 4.266237735748291, + "learning_rate": 2.0830781332097446e-06, + "logits/chosen": 1.6186097860336304, + "logits/rejected": 1.4948759078979492, + "logps/chosen": -3.6410841941833496, + "logps/rejected": -7.069480895996094, + "loss": 0.2213, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.461626052856445, + "rewards/margins": 5.142596244812012, + "rewards/rejected": -10.604223251342773, + "step": 830 + }, + { + "epoch": 0.771792810382451, + "grad_norm": 2.3189163208007812, + "learning_rate": 2.031546713535688e-06, + "logits/chosen": 2.4443275928497314, + "logits/rejected": 2.1987624168395996, + "logps/chosen": -4.127468585968018, + "logps/rejected": -8.15340805053711, + "loss": 0.2525, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.191202640533447, + "rewards/margins": 6.038909435272217, + "rewards/rejected": -12.230112075805664, + "step": 840 + }, + { + "epoch": 0.780980820029861, + "grad_norm": 5.227460861206055, + "learning_rate": 1.9802207729556023e-06, + "logits/chosen": 2.0057969093322754, + "logits/rejected": 1.7481991052627563, + "logps/chosen": -3.890474796295166, + "logps/rejected": -7.659049034118652, + "loss": 0.2253, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.83571195602417, + "rewards/margins": 5.65286111831665, + "rewards/rejected": -11.48857307434082, + "step": 850 + }, + { + "epoch": 0.780980820029861, + "eval_logits/chosen": 1.2305774688720703, + "eval_logits/rejected": 0.8189592957496643, + "eval_logps/chosen": -3.205787420272827, + "eval_logps/rejected": -7.230924606323242, + "eval_loss": 0.23406219482421875, + "eval_rewards/accuracies": 0.9090909361839294, + "eval_rewards/chosen": -4.808681488037109, + "eval_rewards/margins": 6.0377044677734375, + "eval_rewards/rejected": -10.846386909484863, + "eval_runtime": 24.5257, + "eval_samples_per_second": 28.705, + "eval_steps_per_second": 3.588, + "step": 850 + }, + { + "epoch": 0.7901688296772712, + "grad_norm": 5.743741512298584, + "learning_rate": 1.9291228247233607e-06, + "logits/chosen": 2.0627987384796143, + "logits/rejected": 1.964015245437622, + "logps/chosen": -3.810974597930908, + "logps/rejected": -7.854369163513184, + "loss": 0.254, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.716461658477783, + "rewards/margins": 6.065093040466309, + "rewards/rejected": -11.78155517578125, + "step": 860 + }, + { + "epoch": 0.7993568393246813, + "grad_norm": 3.256955623626709, + "learning_rate": 1.8782752820878636e-06, + "logits/chosen": 1.892846703529358, + "logits/rejected": 1.5955915451049805, + "logps/chosen": -4.009222507476807, + "logps/rejected": -7.55028772354126, + "loss": 0.2325, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -6.013833045959473, + "rewards/margins": 5.3115973472595215, + "rewards/rejected": -11.325429916381836, + "step": 870 + }, + { + "epoch": 0.8085448489720914, + "grad_norm": 3.6304337978363037, + "learning_rate": 1.827700448461836e-06, + "logits/chosen": 2.3684587478637695, + "logits/rejected": 2.1417429447174072, + "logps/chosen": -3.8222625255584717, + "logps/rejected": -7.484266757965088, + "loss": 0.2427, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.73339319229126, + "rewards/margins": 5.493006229400635, + "rewards/rejected": -11.226400375366211, + "step": 880 + }, + { + "epoch": 0.8177328586195015, + "grad_norm": 4.88602352142334, + "learning_rate": 1.7774205076388207e-06, + "logits/chosen": 1.8633663654327393, + "logits/rejected": 1.6100108623504639, + "logps/chosen": -3.4299614429473877, + "logps/rejected": -7.589132785797119, + "loss": 0.1988, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.144942283630371, + "rewards/margins": 6.238757133483887, + "rewards/rejected": -11.383699417114258, + "step": 890 + }, + { + "epoch": 0.8269208682669117, + "grad_norm": 3.657270669937134, + "learning_rate": 1.7274575140626318e-06, + "logits/chosen": 1.5545165538787842, + "logits/rejected": 1.600629448890686, + "logps/chosen": -3.981847047805786, + "logps/rejected": -8.365175247192383, + "loss": 0.2055, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.972771167755127, + "rewards/margins": 6.5749921798706055, + "rewards/rejected": -12.54776382446289, + "step": 900 + }, + { + "epoch": 0.8269208682669117, + "eval_logits/chosen": 1.3828459978103638, + "eval_logits/rejected": 0.9709804058074951, + "eval_logps/chosen": -3.4689862728118896, + "eval_logps/rejected": -7.613832473754883, + "eval_loss": 0.2267654538154602, + "eval_rewards/accuracies": 0.9090909361839294, + "eval_rewards/chosen": -5.203479290008545, + "eval_rewards/margins": 6.217269420623779, + "eval_rewards/rejected": -11.420748710632324, + "eval_runtime": 24.422, + "eval_samples_per_second": 28.826, + "eval_steps_per_second": 3.603, + "step": 900 + }, + { + "epoch": 0.8361088779143218, + "grad_norm": 7.899631500244141, + "learning_rate": 1.677833383153542e-06, + "logits/chosen": 1.8724586963653564, + "logits/rejected": 1.4860353469848633, + "logps/chosen": -3.703522205352783, + "logps/rejected": -7.718357086181641, + "loss": 0.2762, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.5552825927734375, + "rewards/margins": 6.02225399017334, + "rewards/rejected": -11.577536582946777, + "step": 910 + }, + { + "epoch": 0.8452968875617319, + "grad_norm": 3.687998056411743, + "learning_rate": 1.6285698816954626e-06, + "logits/chosen": 3.0009334087371826, + "logits/rejected": 2.3279061317443848, + "logps/chosen": -4.25749397277832, + "logps/rejected": -7.3263139724731445, + "loss": 0.2202, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.3862409591674805, + "rewards/margins": 4.603229522705078, + "rewards/rejected": -10.989469528198242, + "step": 920 + }, + { + "epoch": 0.8544848972091421, + "grad_norm": 3.9033992290496826, + "learning_rate": 1.5796886182883053e-06, + "logits/chosen": 2.3675689697265625, + "logits/rejected": 2.2341208457946777, + "logps/chosen": -4.182201385498047, + "logps/rejected": -7.975285530090332, + "loss": 0.1945, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -6.27330207824707, + "rewards/margins": 5.6896257400512695, + "rewards/rejected": -11.962926864624023, + "step": 930 + }, + { + "epoch": 0.8636729068565522, + "grad_norm": 7.484313488006592, + "learning_rate": 1.5312110338697427e-06, + "logits/chosen": 1.737408995628357, + "logits/rejected": 1.6608823537826538, + "logps/chosen": -4.052768707275391, + "logps/rejected": -7.796560764312744, + "loss": 0.2282, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -6.079153537750244, + "rewards/margins": 5.615686893463135, + "rewards/rejected": -11.694841384887695, + "step": 940 + }, + { + "epoch": 0.8728609165039624, + "grad_norm": 5.176153182983398, + "learning_rate": 1.4831583923105e-06, + "logits/chosen": 1.5573890209197998, + "logits/rejected": 1.1255266666412354, + "logps/chosen": -3.52099609375, + "logps/rejected": -7.2554473876953125, + "loss": 0.2434, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.281493663787842, + "rewards/margins": 5.601677417755127, + "rewards/rejected": -10.883171081542969, + "step": 950 + }, + { + "epoch": 0.8728609165039624, + "eval_logits/chosen": 1.3714543581008911, + "eval_logits/rejected": 0.9610424041748047, + "eval_logps/chosen": -3.4983131885528564, + "eval_logps/rejected": -7.699583053588867, + "eval_loss": 0.22121396660804749, + "eval_rewards/accuracies": 0.9204545617103577, + "eval_rewards/chosen": -5.247470378875732, + "eval_rewards/margins": 6.301905632019043, + "eval_rewards/rejected": -11.549375534057617, + "eval_runtime": 24.4334, + "eval_samples_per_second": 28.813, + "eval_steps_per_second": 3.602, + "step": 950 + }, + { + "epoch": 0.8820489261513724, + "grad_norm": 3.092210292816162, + "learning_rate": 1.4355517710873184e-06, + "logits/chosen": 1.5538525581359863, + "logits/rejected": 1.2886962890625, + "logps/chosen": -3.7919907569885254, + "logps/rejected": -7.742431640625, + "loss": 0.2628, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.687986373901367, + "rewards/margins": 5.925662040710449, + "rewards/rejected": -11.613648414611816, + "step": 960 + }, + { + "epoch": 0.8912369357987826, + "grad_norm": 2.258793830871582, + "learning_rate": 1.388412052037682e-06, + "logits/chosen": 1.9938386678695679, + "logits/rejected": 1.7578411102294922, + "logps/chosen": -3.902163028717041, + "logps/rejected": -8.263092041015625, + "loss": 0.1893, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.853244781494141, + "rewards/margins": 6.541393280029297, + "rewards/rejected": -12.394639015197754, + "step": 970 + }, + { + "epoch": 0.9004249454461927, + "grad_norm": 3.3003509044647217, + "learning_rate": 1.3417599122003464e-06, + "logits/chosen": 1.858659029006958, + "logits/rejected": 1.7009245157241821, + "logps/chosen": -3.995110034942627, + "logps/rejected": -7.946316719055176, + "loss": 0.2001, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.9926652908325195, + "rewards/margins": 5.926809310913086, + "rewards/rejected": -11.919473648071289, + "step": 980 + }, + { + "epoch": 0.9096129550936028, + "grad_norm": 5.714742183685303, + "learning_rate": 1.2956158147457116e-06, + "logits/chosen": 2.311112642288208, + "logits/rejected": 1.819894790649414, + "logps/chosen": -4.011856555938721, + "logps/rejected": -8.002031326293945, + "loss": 0.216, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.01778507232666, + "rewards/margins": 5.985262870788574, + "rewards/rejected": -12.003047943115234, + "step": 990 + }, + { + "epoch": 0.918800964741013, + "grad_norm": 2.5513203144073486, + "learning_rate": 1.2500000000000007e-06, + "logits/chosen": 2.042844295501709, + "logits/rejected": 1.7135541439056396, + "logps/chosen": -4.096378803253174, + "logps/rejected": -8.864072799682617, + "loss": 0.1812, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.144567489624023, + "rewards/margins": 7.151541709899902, + "rewards/rejected": -13.296109199523926, + "step": 1000 + }, + { + "epoch": 0.918800964741013, + "eval_logits/chosen": 1.4344538450241089, + "eval_logits/rejected": 1.0740816593170166, + "eval_logps/chosen": -3.56337833404541, + "eval_logps/rejected": -7.854743957519531, + "eval_loss": 0.21498693525791168, + "eval_rewards/accuracies": 0.9204545617103577, + "eval_rewards/chosen": -5.345067977905273, + "eval_rewards/margins": 6.437047004699707, + "eval_rewards/rejected": -11.782115936279297, + "eval_runtime": 24.4225, + "eval_samples_per_second": 28.826, + "eval_steps_per_second": 3.603, + "step": 1000 + }, + { + "epoch": 0.9279889743884231, + "grad_norm": 3.213252067565918, + "learning_rate": 1.204932476567175e-06, + "logits/chosen": 2.064485788345337, + "logits/rejected": 1.9654451608657837, + "logps/chosen": -3.7592079639434814, + "logps/rejected": -8.004045486450195, + "loss": 0.2113, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.638812065124512, + "rewards/margins": 6.367256164550781, + "rewards/rejected": -12.00606918334961, + "step": 1010 + }, + { + "epoch": 0.9371769840358333, + "grad_norm": 3.000185012817383, + "learning_rate": 1.160433012552508e-06, + "logits/chosen": 1.9040180444717407, + "logits/rejected": 1.8388664722442627, + "logps/chosen": -4.3835272789001465, + "logps/rejected": -8.310102462768555, + "loss": 0.2414, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.575291633605957, + "rewards/margins": 5.889863014221191, + "rewards/rejected": -12.465154647827148, + "step": 1020 + }, + { + "epoch": 0.9463649936832433, + "grad_norm": 2.7824106216430664, + "learning_rate": 1.11652112689164e-06, + "logits/chosen": 1.4278347492218018, + "logits/rejected": 1.4083877801895142, + "logps/chosen": -3.6480088233947754, + "logps/rejected": -8.00098705291748, + "loss": 0.1763, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.472012519836426, + "rewards/margins": 6.5294671058654785, + "rewards/rejected": -12.001480102539062, + "step": 1030 + }, + { + "epoch": 0.9555530033306535, + "grad_norm": 5.236849784851074, + "learning_rate": 1.073216080788921e-06, + "logits/chosen": 2.1868062019348145, + "logits/rejected": 1.9401241540908813, + "logps/chosen": -4.442656993865967, + "logps/rejected": -8.857598304748535, + "loss": 0.2158, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.663985252380371, + "rewards/margins": 6.622411251068115, + "rewards/rejected": -13.286396980285645, + "step": 1040 + }, + { + "epoch": 0.9647410129780636, + "grad_norm": 3.4044084548950195, + "learning_rate": 1.0305368692688175e-06, + "logits/chosen": 1.5827648639678955, + "logits/rejected": 1.7356208562850952, + "logps/chosen": -3.988180637359619, + "logps/rejected": -8.252123832702637, + "loss": 0.1967, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.982270240783691, + "rewards/margins": 6.3959150314331055, + "rewards/rejected": -12.378186225891113, + "step": 1050 + }, + { + "epoch": 0.9647410129780636, + "eval_logits/chosen": 1.4267497062683105, + "eval_logits/rejected": 1.0997689962387085, + "eval_logps/chosen": -3.4779577255249023, + "eval_logps/rejected": -7.853654384613037, + "eval_loss": 0.20923514664173126, + "eval_rewards/accuracies": 0.9318181872367859, + "eval_rewards/chosen": -5.216937065124512, + "eval_rewards/margins": 6.563545227050781, + "eval_rewards/rejected": -11.780481338500977, + "eval_runtime": 24.446, + "eval_samples_per_second": 28.798, + "eval_steps_per_second": 3.6, + "step": 1050 + }, + { + "epoch": 0.9739290226254738, + "grad_norm": 7.323760032653809, + "learning_rate": 9.88502212844063e-07, + "logits/chosen": 1.8404476642608643, + "logits/rejected": 1.5995383262634277, + "logps/chosen": -3.8108818531036377, + "logps/rejected": -8.686467170715332, + "loss": 0.2182, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.716322898864746, + "rewards/margins": 7.313376426696777, + "rewards/rejected": -13.029699325561523, + "step": 1060 + }, + { + "epoch": 0.9831170322728839, + "grad_norm": 2.989016056060791, + "learning_rate": 9.471305493042243e-07, + "logits/chosen": 2.1675870418548584, + "logits/rejected": 2.089310646057129, + "logps/chosen": -3.7525386810302734, + "logps/rejected": -8.541536331176758, + "loss": 0.2056, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.628808498382568, + "rewards/margins": 7.183494567871094, + "rewards/rejected": -12.81230354309082, + "step": 1070 + }, + { + "epoch": 0.992305041920294, + "grad_norm": 5.650535583496094, + "learning_rate": 9.064400256282757e-07, + "logits/chosen": 2.612128496170044, + "logits/rejected": 2.198878765106201, + "logps/chosen": -4.110289096832275, + "logps/rejected": -8.18535327911377, + "loss": 0.1974, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.165433406829834, + "rewards/margins": 6.11259651184082, + "rewards/rejected": -12.278030395507812, + "step": 1080 + }, + { + "epoch": 1.000918800964741, + "grad_norm": 3.090592861175537, + "learning_rate": 8.664484900247363e-07, + "logits/chosen": 2.4326796531677246, + "logits/rejected": 2.0719056129455566, + "logps/chosen": -4.295161247253418, + "logps/rejected": -8.057024955749512, + "loss": 0.1982, + "rewards/accuracies": 0.9333333373069763, + "rewards/chosen": -6.442741394042969, + "rewards/margins": 5.642796039581299, + "rewards/rejected": -12.085537910461426, + "step": 1090 + }, + { + "epoch": 1.0101068106121511, + "grad_norm": 3.5762991905212402, + "learning_rate": 8.271734841028553e-07, + "logits/chosen": 1.808143973350525, + "logits/rejected": 1.3669540882110596, + "logps/chosen": -4.025475978851318, + "logps/rejected": -8.578283309936523, + "loss": 0.2476, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.038214683532715, + "rewards/margins": 6.829211235046387, + "rewards/rejected": -12.867425918579102, + "step": 1100 + }, + { + "epoch": 1.0101068106121511, + "eval_logits/chosen": 1.437761902809143, + "eval_logits/rejected": 1.1129838228225708, + "eval_logps/chosen": -3.463768720626831, + "eval_logps/rejected": -7.876412868499756, + "eval_loss": 0.2056656777858734, + "eval_rewards/accuracies": 0.9318181872367859, + "eval_rewards/chosen": -5.195653915405273, + "eval_rewards/margins": 6.6189656257629395, + "eval_rewards/rejected": -11.814618110656738, + "eval_runtime": 24.4283, + "eval_samples_per_second": 28.819, + "eval_steps_per_second": 3.602, + "step": 1100 + }, + { + "epoch": 1.0192948202595613, + "grad_norm": 3.994568109512329, + "learning_rate": 7.886322351782782e-07, + "logits/chosen": 2.0599985122680664, + "logits/rejected": 1.7133548259735107, + "logps/chosen": -4.200375080108643, + "logps/rejected": -8.241968154907227, + "loss": 0.1984, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.300562381744385, + "rewards/margins": 6.062388896942139, + "rewards/rejected": -12.36295223236084, + "step": 1110 + }, + { + "epoch": 1.0284828299069715, + "grad_norm": 4.166400909423828, + "learning_rate": 7.508416487165862e-07, + "logits/chosen": 1.7248255014419556, + "logits/rejected": 1.351927399635315, + "logps/chosen": -3.682168960571289, + "logps/rejected": -8.27522087097168, + "loss": 0.186, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.523253440856934, + "rewards/margins": 6.889578342437744, + "rewards/rejected": -12.412832260131836, + "step": 1120 + }, + { + "epoch": 1.0376708395543814, + "grad_norm": 4.695723533630371, + "learning_rate": 7.138183009179922e-07, + "logits/chosen": 2.1033718585968018, + "logits/rejected": 1.9854538440704346, + "logps/chosen": -4.065175533294678, + "logps/rejected": -8.255699157714844, + "loss": 0.1932, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.097762584686279, + "rewards/margins": 6.285785675048828, + "rewards/rejected": -12.38354778289795, + "step": 1130 + }, + { + "epoch": 1.0468588492017916, + "grad_norm": 4.183388710021973, + "learning_rate": 6.775784314464717e-07, + "logits/chosen": 2.612067699432373, + "logits/rejected": 2.1869616508483887, + "logps/chosen": -3.6629364490509033, + "logps/rejected": -8.047903060913086, + "loss": 0.2186, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.494403839111328, + "rewards/margins": 6.577451229095459, + "rewards/rejected": -12.071855545043945, + "step": 1140 + }, + { + "epoch": 1.0560468588492018, + "grad_norm": 5.9020514488220215, + "learning_rate": 6.421379363065142e-07, + "logits/chosen": 1.2938798666000366, + "logits/rejected": 1.1452168226242065, + "logps/chosen": -4.144273281097412, + "logps/rejected": -8.718961715698242, + "loss": 0.2127, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -6.216409206390381, + "rewards/margins": 6.862032890319824, + "rewards/rejected": -13.078442573547363, + "step": 1150 + }, + { + "epoch": 1.0560468588492018, + "eval_logits/chosen": 1.4493238925933838, + "eval_logits/rejected": 1.128269076347351, + "eval_logps/chosen": -3.538313150405884, + "eval_logps/rejected": -7.989621162414551, + "eval_loss": 0.20314906537532806, + "eval_rewards/accuracies": 0.9318181872367859, + "eval_rewards/chosen": -5.307469367980957, + "eval_rewards/margins": 6.676960468292236, + "eval_rewards/rejected": -11.984430313110352, + "eval_runtime": 24.6248, + "eval_samples_per_second": 28.589, + "eval_steps_per_second": 3.574, + "step": 1150 + }, + { + "epoch": 1.065234868496612, + "grad_norm": 3.6220788955688477, + "learning_rate": 6.075123608706093e-07, + "logits/chosen": 1.9196579456329346, + "logits/rejected": 1.8552544116973877, + "logps/chosen": -3.56549334526062, + "logps/rejected": -8.362249374389648, + "loss": 0.2277, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.348240852355957, + "rewards/margins": 7.195133209228516, + "rewards/rejected": -12.543375015258789, + "step": 1160 + }, + { + "epoch": 1.0744228781440222, + "grad_norm": 3.9738340377807617, + "learning_rate": 5.737168930605272e-07, + "logits/chosen": 2.693675994873047, + "logits/rejected": 2.596327543258667, + "logps/chosen": -3.953673839569092, + "logps/rejected": -8.083026885986328, + "loss": 0.2156, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.930511474609375, + "rewards/margins": 6.194031238555908, + "rewards/rejected": -12.124540328979492, + "step": 1170 + }, + { + "epoch": 1.083610887791432, + "grad_norm": 3.9082953929901123, + "learning_rate": 5.407663566854008e-07, + "logits/chosen": 2.457482099533081, + "logits/rejected": 2.206406354904175, + "logps/chosen": -4.224593162536621, + "logps/rejected": -8.104263305664062, + "loss": 0.1883, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.336889266967773, + "rewards/margins": 5.8195061683654785, + "rewards/rejected": -12.15639591217041, + "step": 1180 + }, + { + "epoch": 1.0927988974388423, + "grad_norm": 2.5964362621307373, + "learning_rate": 5.086752049395094e-07, + "logits/chosen": 1.9967896938323975, + "logits/rejected": 1.618037462234497, + "logps/chosen": -3.5683434009552, + "logps/rejected": -7.307412624359131, + "loss": 0.1798, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.352514743804932, + "rewards/margins": 5.608603477478027, + "rewards/rejected": -10.961118698120117, + "step": 1190 + }, + { + "epoch": 1.1019869070862525, + "grad_norm": 2.499810218811035, + "learning_rate": 4.774575140626317e-07, + "logits/chosen": 1.7958850860595703, + "logits/rejected": 1.480818748474121, + "logps/chosen": -3.8547580242156982, + "logps/rejected": -8.144584655761719, + "loss": 0.2005, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.7821364402771, + "rewards/margins": 6.4347405433654785, + "rewards/rejected": -12.216877937316895, + "step": 1200 + }, + { + "epoch": 1.1019869070862525, + "eval_logits/chosen": 1.4727997779846191, + "eval_logits/rejected": 1.1707344055175781, + "eval_logps/chosen": -3.532540798187256, + "eval_logps/rejected": -8.002986907958984, + "eval_loss": 0.2008148729801178, + "eval_rewards/accuracies": 0.9204545617103577, + "eval_rewards/chosen": -5.298811435699463, + "eval_rewards/margins": 6.705668926239014, + "eval_rewards/rejected": -12.004480361938477, + "eval_runtime": 24.695, + "eval_samples_per_second": 28.508, + "eval_steps_per_second": 3.563, + "step": 1200 + }, + { + "epoch": 1.1111749167336626, + "grad_norm": 3.8940417766571045, + "learning_rate": 4.4712697716573994e-07, + "logits/chosen": 2.203734874725342, + "logits/rejected": 2.0729122161865234, + "logps/chosen": -4.096261501312256, + "logps/rejected": -7.813987731933594, + "loss": 0.1796, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.144392490386963, + "rewards/margins": 5.5765886306762695, + "rewards/rejected": -11.720980644226074, + "step": 1210 + }, + { + "epoch": 1.1203629263810726, + "grad_norm": 4.197968006134033, + "learning_rate": 4.1769689822475147e-07, + "logits/chosen": 1.5070486068725586, + "logits/rejected": 1.5961544513702393, + "logps/chosen": -3.433948040008545, + "logps/rejected": -8.33233642578125, + "loss": 0.1742, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.1509222984313965, + "rewards/margins": 7.3475823402404785, + "rewards/rejected": -12.498504638671875, + "step": 1220 + }, + { + "epoch": 1.1295509360284828, + "grad_norm": 3.927893877029419, + "learning_rate": 3.891801862449629e-07, + "logits/chosen": 2.136690616607666, + "logits/rejected": 2.030285120010376, + "logps/chosen": -4.768858909606934, + "logps/rejected": -8.714016914367676, + "loss": 0.1811, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.153287410736084, + "rewards/margins": 5.917738914489746, + "rewards/rejected": -13.071026802062988, + "step": 1230 + }, + { + "epoch": 1.138738945675893, + "grad_norm": 3.148071765899658, + "learning_rate": 3.615893495987335e-07, + "logits/chosen": 2.084843873977661, + "logits/rejected": 2.249072551727295, + "logps/chosen": -4.252139091491699, + "logps/rejected": -8.39255428314209, + "loss": 0.1942, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.378207683563232, + "rewards/margins": 6.210625648498535, + "rewards/rejected": -12.588833808898926, + "step": 1240 + }, + { + "epoch": 1.1479269553233031, + "grad_norm": 1.6102039813995361, + "learning_rate": 3.3493649053890325e-07, + "logits/chosen": 2.5606529712677, + "logits/rejected": 2.4942524433135986, + "logps/chosen": -4.308623313903809, + "logps/rejected": -8.170819282531738, + "loss": 0.2088, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.4629340171813965, + "rewards/margins": 5.793294429779053, + "rewards/rejected": -12.256229400634766, + "step": 1250 + }, + { + "epoch": 1.1479269553233031, + "eval_logits/chosen": 1.4887795448303223, + "eval_logits/rejected": 1.181876540184021, + "eval_logps/chosen": -3.5690500736236572, + "eval_logps/rejected": -8.063680648803711, + "eval_loss": 0.20090900361537933, + "eval_rewards/accuracies": 0.9318181872367859, + "eval_rewards/chosen": -5.353575229644775, + "eval_rewards/margins": 6.741945266723633, + "eval_rewards/rejected": -12.09552001953125, + "eval_runtime": 24.433, + "eval_samples_per_second": 28.813, + "eval_steps_per_second": 3.602, + "step": 1250 + }, + { + "epoch": 1.1571149649707133, + "grad_norm": 5.161710262298584, + "learning_rate": 3.092332998903416e-07, + "logits/chosen": 1.6824312210083008, + "logits/rejected": 1.8732401132583618, + "logps/chosen": -4.1795806884765625, + "logps/rejected": -8.756204605102539, + "loss": 0.1833, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -6.269371032714844, + "rewards/margins": 6.864935874938965, + "rewards/rejected": -13.134305953979492, + "step": 1260 + }, + { + "epoch": 1.1663029746181233, + "grad_norm": 11.763586044311523, + "learning_rate": 2.844910519219632e-07, + "logits/chosen": 1.6715238094329834, + "logits/rejected": 1.2933677434921265, + "logps/chosen": -3.8149237632751465, + "logps/rejected": -8.005244255065918, + "loss": 0.2073, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.722386360168457, + "rewards/margins": 6.285480499267578, + "rewards/rejected": -12.007866859436035, + "step": 1270 + }, + { + "epoch": 1.1754909842655334, + "grad_norm": 3.0442380905151367, + "learning_rate": 2.6072059940146775e-07, + "logits/chosen": 1.210684061050415, + "logits/rejected": 1.0442299842834473, + "logps/chosen": -4.0504326820373535, + "logps/rejected": -8.990427017211914, + "loss": 0.2031, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.075650215148926, + "rewards/margins": 7.409991264343262, + "rewards/rejected": -13.485639572143555, + "step": 1280 + }, + { + "epoch": 1.1846789939129436, + "grad_norm": 2.9671738147735596, + "learning_rate": 2.3793236883495164e-07, + "logits/chosen": 2.123716115951538, + "logits/rejected": 1.7563165426254272, + "logps/chosen": -4.090588092803955, + "logps/rejected": -8.25685977935791, + "loss": 0.2191, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.1358819007873535, + "rewards/margins": 6.249407768249512, + "rewards/rejected": -12.385289192199707, + "step": 1290 + }, + { + "epoch": 1.1938670035603538, + "grad_norm": 5.619577407836914, + "learning_rate": 2.1613635589349756e-07, + "logits/chosen": 2.250432252883911, + "logits/rejected": 2.119335174560547, + "logps/chosen": -4.006811141967773, + "logps/rejected": -7.816763401031494, + "loss": 0.2523, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.01021671295166, + "rewards/margins": 5.71492862701416, + "rewards/rejected": -11.72514533996582, + "step": 1300 + }, + { + "epoch": 1.1938670035603538, + "eval_logits/chosen": 1.4931063652038574, + "eval_logits/rejected": 1.1866812705993652, + "eval_logps/chosen": -3.5401999950408936, + "eval_logps/rejected": -8.020904541015625, + "eval_loss": 0.20123924314975739, + "eval_rewards/accuracies": 0.9204545617103577, + "eval_rewards/chosen": -5.310299873352051, + "eval_rewards/margins": 6.721057415008545, + "eval_rewards/rejected": -12.031357765197754, + "eval_runtime": 24.4186, + "eval_samples_per_second": 28.83, + "eval_steps_per_second": 3.604, + "step": 1300 + }, + { + "epoch": 1.2030550132077638, + "grad_norm": 5.24505615234375, + "learning_rate": 1.95342121028749e-07, + "logits/chosen": 2.028916597366333, + "logits/rejected": 1.8642489910125732, + "logps/chosen": -3.7371432781219482, + "logps/rejected": -8.189066886901855, + "loss": 0.2, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.605715274810791, + "rewards/margins": 6.67788553237915, + "rewards/rejected": -12.283600807189941, + "step": 1310 + }, + { + "epoch": 1.212243022855174, + "grad_norm": 2.5810768604278564, + "learning_rate": 1.7555878527937164e-07, + "logits/chosen": 2.4587361812591553, + "logits/rejected": 2.419093608856201, + "logps/chosen": -3.863225221633911, + "logps/rejected": -8.523834228515625, + "loss": 0.1589, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.7948384284973145, + "rewards/margins": 6.990914821624756, + "rewards/rejected": -12.785752296447754, + "step": 1320 + }, + { + "epoch": 1.2214310325025841, + "grad_norm": 1.403421401977539, + "learning_rate": 1.567950262702714e-07, + "logits/chosen": 1.8351008892059326, + "logits/rejected": 1.7053043842315674, + "logps/chosen": -4.444234371185303, + "logps/rejected": -8.667207717895508, + "loss": 0.1793, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -6.666351318359375, + "rewards/margins": 6.334461212158203, + "rewards/rejected": -13.000813484191895, + "step": 1330 + }, + { + "epoch": 1.2306190421499943, + "grad_norm": 3.304309368133545, + "learning_rate": 1.3905907440629752e-07, + "logits/chosen": 2.251183032989502, + "logits/rejected": 2.064572811126709, + "logps/chosen": -4.050612449645996, + "logps/rejected": -8.985613822937012, + "loss": 0.1779, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -6.075918674468994, + "rewards/margins": 7.40250301361084, + "rewards/rejected": -13.478421211242676, + "step": 1340 + }, + { + "epoch": 1.2398070517974045, + "grad_norm": 2.867111921310425, + "learning_rate": 1.223587092621162e-07, + "logits/chosen": 1.5418930053710938, + "logits/rejected": 1.4867240190505981, + "logps/chosen": -4.031666278839111, + "logps/rejected": -8.506909370422363, + "loss": 0.1736, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.047499179840088, + "rewards/margins": 6.712864875793457, + "rewards/rejected": -12.760363578796387, + "step": 1350 + }, + { + "epoch": 1.2398070517974045, + "eval_logits/chosen": 1.4972354173660278, + "eval_logits/rejected": 1.1784640550613403, + "eval_logps/chosen": -3.556915283203125, + "eval_logps/rejected": -8.036223411560059, + "eval_loss": 0.19999442994594574, + "eval_rewards/accuracies": 0.9318181872367859, + "eval_rewards/chosen": -5.335372447967529, + "eval_rewards/margins": 6.718961715698242, + "eval_rewards/rejected": -12.054333686828613, + "eval_runtime": 24.4074, + "eval_samples_per_second": 28.844, + "eval_steps_per_second": 3.605, + "step": 1350 + }, + { + "epoch": 1.2489950614448144, + "grad_norm": 2.1836965084075928, + "learning_rate": 1.067012561698319e-07, + "logits/chosen": 2.2363076210021973, + "logits/rejected": 2.0226285457611084, + "logps/chosen": -4.427682876586914, + "logps/rejected": -8.549234390258789, + "loss": 0.2027, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.641523838043213, + "rewards/margins": 6.1823272705078125, + "rewards/rejected": -12.823850631713867, + "step": 1360 + }, + { + "epoch": 1.2581830710922246, + "grad_norm": 7.196364402770996, + "learning_rate": 9.209358300585474e-08, + "logits/chosen": 2.689990758895874, + "logits/rejected": 2.3472132682800293, + "logps/chosen": -3.9703564643859863, + "logps/rejected": -8.000662803649902, + "loss": 0.1962, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.9555344581604, + "rewards/margins": 6.0454607009887695, + "rewards/rejected": -12.000995635986328, + "step": 1370 + }, + { + "epoch": 1.2673710807396348, + "grad_norm": 8.442782402038574, + "learning_rate": 7.854209717842231e-08, + "logits/chosen": 2.107114315032959, + "logits/rejected": 1.749751091003418, + "logps/chosen": -3.971087694168091, + "logps/rejected": -8.122562408447266, + "loss": 0.2435, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.956631183624268, + "rewards/margins": 6.227211952209473, + "rewards/rejected": -12.183842658996582, + "step": 1380 + }, + { + "epoch": 1.276559090387045, + "grad_norm": 5.254242897033691, + "learning_rate": 6.605274281709929e-08, + "logits/chosen": 2.106802463531494, + "logits/rejected": 1.8030809164047241, + "logps/chosen": -4.336814880371094, + "logps/rejected": -8.912458419799805, + "loss": 0.2051, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -6.505222320556641, + "rewards/margins": 6.863465785980225, + "rewards/rejected": -13.368688583374023, + "step": 1390 + }, + { + "epoch": 1.2857471000344551, + "grad_norm": 5.613115310668945, + "learning_rate": 5.463099816548578e-08, + "logits/chosen": 1.6433700323104858, + "logits/rejected": 1.274550437927246, + "logps/chosen": -3.915881633758545, + "logps/rejected": -7.747851371765137, + "loss": 0.2139, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.873822212219238, + "rewards/margins": 5.747954368591309, + "rewards/rejected": -11.62177848815918, + "step": 1400 + }, + { + "epoch": 1.2857471000344551, + "eval_logits/chosen": 1.4849358797073364, + "eval_logits/rejected": 1.166409969329834, + "eval_logps/chosen": -3.554471492767334, + "eval_logps/rejected": -8.035895347595215, + "eval_loss": 0.2001064419746399, + "eval_rewards/accuracies": 0.9318181872367859, + "eval_rewards/chosen": -5.33170747756958, + "eval_rewards/margins": 6.7221360206604, + "eval_rewards/rejected": -12.053844451904297, + "eval_runtime": 24.4239, + "eval_samples_per_second": 28.824, + "eval_steps_per_second": 3.603, + "step": 1400 + }, + { + "epoch": 1.294935109681865, + "grad_norm": 3.2592716217041016, + "learning_rate": 4.428187317827848e-08, + "logits/chosen": 1.6605371236801147, + "logits/rejected": 1.512782335281372, + "logps/chosen": -4.016318321228027, + "logps/rejected": -8.222332954406738, + "loss": 0.1929, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.024477481842041, + "rewards/margins": 6.30902099609375, + "rewards/rejected": -12.33349895477295, + "step": 1410 + }, + { + "epoch": 1.3041231193292753, + "grad_norm": 3.8764472007751465, + "learning_rate": 3.5009907323737826e-08, + "logits/chosen": 1.6769720315933228, + "logits/rejected": 1.466618299484253, + "logps/chosen": -3.6894333362579346, + "logps/rejected": -7.910569190979004, + "loss": 0.186, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.534149646759033, + "rewards/margins": 6.331704139709473, + "rewards/rejected": -11.865854263305664, + "step": 1420 + }, + { + "epoch": 1.3133111289766854, + "grad_norm": 3.0787954330444336, + "learning_rate": 2.681916759252917e-08, + "logits/chosen": 2.332460403442383, + "logits/rejected": 1.9827533960342407, + "logps/chosen": -4.545821189880371, + "logps/rejected": -8.547601699829102, + "loss": 0.2139, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.818732261657715, + "rewards/margins": 6.002669811248779, + "rewards/rejected": -12.821401596069336, + "step": 1430 + }, + { + "epoch": 1.3224991386240956, + "grad_norm": 4.644412994384766, + "learning_rate": 1.9713246713805588e-08, + "logits/chosen": 2.217742443084717, + "logits/rejected": 1.9464161396026611, + "logps/chosen": -3.950064182281494, + "logps/rejected": -8.264387130737305, + "loss": 0.199, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.92509651184082, + "rewards/margins": 6.4714837074279785, + "rewards/rejected": -12.396580696105957, + "step": 1440 + }, + { + "epoch": 1.3316871482715058, + "grad_norm": 4.489632606506348, + "learning_rate": 1.3695261579316776e-08, + "logits/chosen": 1.3916233777999878, + "logits/rejected": 1.4683945178985596, + "logps/chosen": -4.025264739990234, + "logps/rejected": -9.182435989379883, + "loss": 0.2255, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -6.037897109985352, + "rewards/margins": 7.735756874084473, + "rewards/rejected": -13.773653984069824, + "step": 1450 + }, + { + "epoch": 1.3316871482715058, + "eval_logits/chosen": 1.4939136505126953, + "eval_logits/rejected": 1.1826014518737793, + "eval_logps/chosen": -3.5562257766723633, + "eval_logps/rejected": -8.059898376464844, + "eval_loss": 0.19852806627750397, + "eval_rewards/accuracies": 0.9431818127632141, + "eval_rewards/chosen": -5.334338665008545, + "eval_rewards/margins": 6.755507946014404, + "eval_rewards/rejected": -12.08984661102295, + "eval_runtime": 24.3883, + "eval_samples_per_second": 28.866, + "eval_steps_per_second": 3.608, + "step": 1450 + }, + { + "epoch": 1.3408751579189158, + "grad_norm": 4.516350746154785, + "learning_rate": 8.767851876239075e-09, + "logits/chosen": 2.247871160507202, + "logits/rejected": 2.161593198776245, + "logps/chosen": -4.095339298248291, + "logps/rejected": -8.592370986938477, + "loss": 0.1777, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.143008232116699, + "rewards/margins": 6.745549201965332, + "rewards/rejected": -12.888557434082031, + "step": 1460 + }, + { + "epoch": 1.350063167566326, + "grad_norm": 3.5604116916656494, + "learning_rate": 4.933178929321103e-09, + "logits/chosen": 2.272150754928589, + "logits/rejected": 2.137777090072632, + "logps/chosen": -4.238589286804199, + "logps/rejected": -8.042024612426758, + "loss": 0.2021, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.357884407043457, + "rewards/margins": 5.70515251159668, + "rewards/rejected": -12.063036918640137, + "step": 1470 + }, + { + "epoch": 1.3592511772137361, + "grad_norm": 1.878821849822998, + "learning_rate": 2.192924752854042e-09, + "logits/chosen": 2.2196030616760254, + "logits/rejected": 1.7679128646850586, + "logps/chosen": -4.042660236358643, + "logps/rejected": -8.428380012512207, + "loss": 0.1908, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.063990592956543, + "rewards/margins": 6.578577995300293, + "rewards/rejected": -12.642568588256836, + "step": 1480 + }, + { + "epoch": 1.368439186861146, + "grad_norm": 4.727602958679199, + "learning_rate": 5.48291312886251e-10, + "logits/chosen": 2.2299444675445557, + "logits/rejected": 2.112347364425659, + "logps/chosen": -4.057961463928223, + "logps/rejected": -7.997668266296387, + "loss": 0.1912, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -6.086942672729492, + "rewards/margins": 5.909560203552246, + "rewards/rejected": -11.996503829956055, + "step": 1490 + }, + { + "epoch": 1.3776271965085563, + "grad_norm": 3.4773285388946533, + "learning_rate": 0.0, + "logits/chosen": 2.782447576522827, + "logits/rejected": 2.5681684017181396, + "logps/chosen": -4.227536678314209, + "logps/rejected": -8.583551406860352, + "loss": 0.2239, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -6.341304779052734, + "rewards/margins": 6.534020900726318, + "rewards/rejected": -12.875326156616211, + "step": 1500 + }, + { + "epoch": 1.3776271965085563, + "eval_logits/chosen": 1.4909855127334595, + "eval_logits/rejected": 1.1811962127685547, + "eval_logps/chosen": -3.5707902908325195, + "eval_logps/rejected": -8.077098846435547, + "eval_loss": 0.2004113346338272, + "eval_rewards/accuracies": 0.9318181872367859, + "eval_rewards/chosen": -5.3561859130859375, + "eval_rewards/margins": 6.759463310241699, + "eval_rewards/rejected": -12.115647315979004, + "eval_runtime": 24.4303, + "eval_samples_per_second": 28.817, + "eval_steps_per_second": 3.602, + "step": 1500 + } + ], + "logging_steps": 10, + "max_steps": 1500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.642599643201667e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}